diff --git a/.clangd b/.clangd index 49cceb761a7..2963e5891df 100644 --- a/.clangd +++ b/.clangd @@ -51,6 +51,7 @@ CompileFlags: # strip CUDA flags unknown to clang - "-ccbin*" - "--compiler-options*" + - "--extended-lambda" - "--expt-extended-lambda" - "--expt-relaxed-constexpr" - "-forward-unknown-to-host-compiler" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 24864c6631a..65a57ee3469 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,20 +1,29 @@ -# general codeowners for all files -# (Order matters. This needs to be at the top) -* @nvidia/cccl-codeowners - # Libraries -thrust/ @nvidia/cccl-thrust-codeowners @nvidia/cccl-codeowners -cub/ @nvidia/cccl-cub-codeowners @nvidia/cccl-codeowners -libcudacxx/ @nvidia/cccl-libcudacxx-codeowners @nvidia/cccl-codeowners +thrust/ @nvidia/cccl-thrust-codeowners +cub/ @nvidia/cccl-cub-codeowners +libcudacxx/ @nvidia/cccl-libcudacxx-codeowners cudax/ @nvidia/cccl-cudax-codeowners c/ @nvidia/cccl-c-codeowners python/ @nvidia/cccl-python-codeowners # Infrastructure -.github/ @nvidia/cccl-infra-codeowners @nvidia/cccl-codeowners -ci/ @nvidia/cccl-infra-codeowners @nvidia/cccl-codeowners -.devcontainer/ @nvidia/cccl-infra-codeowners @nvidia/cccl-codeowners +.github/ @nvidia/cccl-infra-codeowners +ci/ @nvidia/cccl-infra-codeowners +.devcontainer/ @nvidia/cccl-infra-codeowners +.pre-commit-config.yaml @nvidia/cccl-infra-codeowners +.clang-format @nvidia/cccl-infra-codeowners +.clangd @nvidia/cccl-infra-codeowners +c2h/ @nvidia/cccl-infra-codeowners +.vscode @nvidia/cccl-infra-codeowners # cmake -**/CMakeLists.txt @nvidia/cccl-cmake-codeowners @nvidia/cccl-codeowners -**/cmake/ @nvidia/cccl-cmake-codeowners @nvidia/cccl-codeowners +**/CMakeLists.txt @nvidia/cccl-cmake-codeowners +**/cmake/ @nvidia/cccl-cmake-codeowners + +# benchmarks +benchmarks/ @nvidia/cccl-benchmark-codeowners +**/benchmarks @nvidia/cccl-benchmark-codeowners + +# docs +docs/ @nvidia/cccl-docs-codeowners +examples/ @nvidia/cccl-docs-codeowners diff --git a/.github/actions/docs-build/action.yml b/.github/actions/docs-build/action.yml index db7f3231742..bf2e3077ebb 100644 --- a/.github/actions/docs-build/action.yml +++ b/.github/actions/docs-build/action.yml @@ -54,4 +54,4 @@ runs: # Upload docs as pages artifacts - name: Upload artifact if: ${{ inputs.upload_pages_artifact == 'true' }} - uses: actions/upload-pages-artifact@v2 + uses: actions/upload-pages-artifact@v3 diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index 2b1cff7b4f2..2f655b4fddc 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -45,4 +45,4 @@ jobs: steps: - name: Deploy to GitHub Pages id: deployment - uses: actions/deploy-pages@v2 + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/build-rapids.yml b/.github/workflows/build-rapids.yml index aaee38b05e7..4ee586d0121 100644 --- a/.github/workflows/build-rapids.yml +++ b/.github/workflows/build-rapids.yml @@ -134,6 +134,12 @@ jobs: sccache --show-adv-stats done done + + # Exit with error if any failures occurred + if test ${#failures[@]} -ne 0; then + exit 1 + fi + EOF chmod +x "$RUNNER_TEMP"/ci{,-entrypoint}.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d317e931e78..e61d2f349ea 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,6 +43,17 @@ repos: hooks: - id: ruff # linter - id: ruff-format # formatter + + # TOML lint & format + - repo: https://github.com/ComPWA/taplo-pre-commit + rev: v0.9.3 + hooks: + # See 
https://github.com/NVIDIA/cccl/issues/3426 + # - id: taplo-lint + # exclude: "^docs/" + - id: taplo-format + exclude: "^docs/" + - repo: https://github.com/codespell-project/codespell rev: v2.3.0 hooks: diff --git a/CMakePresets.json b/CMakePresets.json index bd10a95200b..dcaf9b75977 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -73,8 +73,6 @@ "CUB_ENABLE_DIALECT_CPP20": true, "THRUST_ENABLE_MULTICONFIG": true, "THRUST_MULTICONFIG_WORKLOAD": "LARGE", - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11": true, - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP14": true, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17": true, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP20": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP": true, @@ -128,28 +126,6 @@ "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true } }, - { - "name": "libcudacxx-cpp11", - "displayName": "libcu++: C++11", - "inherits": "libcudacxx-base", - "cacheVariables": { - "CMAKE_CXX_STANDARD": "11", - "CMAKE_CUDA_STANDARD": "11", - "LIBCUDACXX_TEST_STANDARD_VER": "c++11", - "CCCL_IGNORE_DEPRECATED_CPP_11": true - } - }, - { - "name": "libcudacxx-cpp14", - "displayName": "libcu++: C++14", - "inherits": "libcudacxx-base", - "cacheVariables": { - "CMAKE_CXX_STANDARD": "14", - "CMAKE_CUDA_STANDARD": "14", - "LIBCUDACXX_TEST_STANDARD_VER": "c++14", - "CCCL_IGNORE_DEPRECATED_CPP_14": true - } - }, { "name": "libcudacxx-cpp17", "displayName": "libcu++: C++17", @@ -179,28 +155,6 @@ "CMAKE_CUDA_ARCHITECTURES": "70" } }, - { - "name": "libcudacxx-nvrtc-cpp11", - "displayName": "libcu++ NVRTC: C++11", - "inherits": "libcudacxx-nvrtc-base", - "cacheVariables": { - "CMAKE_CXX_STANDARD": "11", - "CMAKE_CUDA_STANDARD": "11", - "LIBCUDACXX_TEST_STANDARD_VER": "c++11", - "CCCL_IGNORE_DEPRECATED_CPP_11": true - } - }, - { - "name": "libcudacxx-nvrtc-cpp14", - "displayName": "libcu++ NVRTC: C++14", - "inherits": "libcudacxx-nvrtc-base", - "cacheVariables": { - "CMAKE_CXX_STANDARD": "14", - "CMAKE_CUDA_STANDARD": "14", - "LIBCUDACXX_TEST_STANDARD_VER": "c++14", - "CCCL_IGNORE_DEPRECATED_CPP_14": true - } - }, { "name": "libcudacxx-nvrtc-cpp17", "displayName": "libcu++ NVRTC: C++17", @@ -261,8 +215,6 @@ "THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB": true, - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11": false, - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP14": false, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17": false, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP20": false } @@ -420,22 +372,6 @@ "libcudacxx.test.atomics.ptx" ] }, - { - "name": "libcudacxx-nvrtc-cpp11", - "hidden": false, - "inherits": [ - "libcudacxx-nvrtcc" - ], - "configurePreset": "libcudacxx-nvrtc-cpp11" - }, - { - "name": "libcudacxx-nvrtc-cpp14", - "hidden": false, - "inherits": [ - "libcudacxx-nvrtcc" - ], - "configurePreset": "libcudacxx-nvrtc-cpp14" - }, { "name": "libcudacxx-nvrtc-cpp17", "hidden": false, @@ -452,20 +388,6 @@ ], "configurePreset": "libcudacxx-nvrtc-cpp20" }, - { - "name": "libcudacxx-cpp11", - "configurePreset": "libcudacxx-cpp11", - "inherits": [ - "libcudacxx-base" - ] - }, - { - "name": "libcudacxx-cpp14", - "configurePreset": "libcudacxx-cpp14", - "inherits": [ - "libcudacxx-base" - ] - }, { "name": "libcudacxx-cpp17", "configurePreset": "libcudacxx-cpp17", @@ -572,20 +494,6 @@ "outputOnFailure": false } }, - { - "name": "libcudacxx-lit-cpp11", - "configurePreset": "libcudacxx-cpp11", - "inherits": [ - "libcudacxx-lit-base" - ] - }, - { - "name": "libcudacxx-lit-cpp14", - "configurePreset": "libcudacxx-cpp14", - "inherits": 
[ - "libcudacxx-lit-base" - ] - }, { "name": "libcudacxx-lit-cpp17", "configurePreset": "libcudacxx-cpp17", @@ -607,20 +515,6 @@ "libcudacxx-lit-base" ] }, - { - "name": "libcudacxx-nvrtc-cpp11", - "configurePreset": "libcudacxx-nvrtc-cpp11", - "inherits": [ - "libcudacxx-nvrtc-base" - ] - }, - { - "name": "libcudacxx-nvrtc-cpp14", - "configurePreset": "libcudacxx-nvrtc-cpp14", - "inherits": [ - "libcudacxx-nvrtc-base" - ] - }, { "name": "libcudacxx-nvrtc-cpp17", "configurePreset": "libcudacxx-nvrtc-cpp17", diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 703a7ead85b..54627d06868 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -160,7 +160,7 @@ std::string get_single_tile_kernel_name( check(nvrtcGetTypeName(&reduction_op_t)); return std::format( - "cub::DeviceReduceSingleTileKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}>", + "cub::detail::reduce::DeviceReduceSingleTileKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}>", chained_policy_t, input_iterator_t, output_iterator_t, @@ -192,7 +192,7 @@ std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it check(nvrtcGetTypeName(&transform_op_t)); return std::format( - "cub::DeviceReduceKernel<{0}, {1}, {2}, {3}, {4}, {5}>", + "cub::detail::reduce::DeviceReduceKernel<{0}, {1}, {2}, {3}, {4}, {5}>", chained_policy_t, input_iterator_t, offset_t, diff --git a/c/parallel/test/test_main.cpp b/c/parallel/test/test_main.cpp index 3e3b4900a5d..d1fb01d96bd 100644 --- a/c/parallel/test/test_main.cpp +++ b/c/parallel/test/test_main.cpp @@ -12,8 +12,7 @@ #include -#define CATCH_CONFIG_RUNNER -#include +#include int device_guard(int device_id) { @@ -40,7 +39,7 @@ int main(int argc, char* argv[]) int device_id{}; // Build a new parser on top of Catch's - using namespace Catch::clara; + using namespace Catch::Clara; auto cli = session.cli() | Opt(device_id, "device")["-d"]["--device"]("device id to use"); session.cli(cli); diff --git a/c/parallel/test/test_util.h b/c/parallel/test/test_util.h index 3f7010a3e36..456a717c4d8 100644 --- a/c/parallel/test/test_util.h +++ b/c/parallel/test/test_util.h @@ -22,7 +22,9 @@ #include #include -#include +#include +#include +#include #include #include diff --git a/c2h/include/c2h/catch2_main.h b/c2h/include/c2h/catch2_main.h index dc1fa2eba16..8005d33a649 100644 --- a/c2h/include/c2h/catch2_main.h +++ b/c2h/include/c2h/catch2_main.h @@ -36,13 +36,9 @@ //! executable, this header is included into each test. On the other hand, when all the tests are compiled into a single //! 
executable, this header is excluded from the tests and included into catch2_runner.cpp -#ifdef CUB_CONFIG_MAIN -# define CATCH_CONFIG_RUNNER -#endif - -#include +#include -#if defined(CUB_CONFIG_MAIN) +#ifdef CUB_CONFIG_MAIN # if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA # include @@ -59,7 +55,7 @@ int main(int argc, char* argv[]) int device_id{}; // Build a new parser on top of Catch's - using namespace Catch::clara; + using namespace Catch::Clara; auto cli = session.cli() | Opt(device_id, "device")["-d"]["--device"]("device id to use"); session.cli(cli); @@ -73,4 +69,4 @@ int main(int argc, char* argv[]) # endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA return session.run(argc, argv); } -#endif +#endif // CUB_CONFIG_MAIN diff --git a/c2h/include/c2h/catch2_test_helper.h b/c2h/include/c2h/catch2_test_helper.h index 729c1f43ebd..585b24a70b6 100644 --- a/c2h/include/c2h/catch2_test_helper.h +++ b/c2h/include/c2h/catch2_test_helper.h @@ -39,15 +39,37 @@ #include #include -#if __CUDACC_VER_MAJOR__ == 11 -_CCCL_NV_DIAG_SUPPRESS(177) // catch2 may contain unused variables -#endif // nvcc-11 - #include #include #include #include #include +#include +#include +#include +#include +#include + +// workaround for error #3185-D: no '#pragma diagnostic push' was found to match this 'diagnostic pop' +#if _CCCL_COMPILER(NVHPC) +# undef CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma("diag push") +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma("diag pop") +#endif +// workaround for error +// * MSVC14.39: #3185-D: no '#pragma diagnostic push' was found to match this 'diagnostic pop' +// * MSVC14.29: internal error: assertion failed: alloc_copy_of_pending_pragma: copied pragma has source sequence entry +// (pragma.c, line 526 in alloc_copy_of_pending_pragma) +// see also upstream Catch2 issue: https://github.com/catchorg/Catch2/issues/2636 +#if _CCCL_COMPILER(MSVC) +# undef CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS +#endif #ifndef VAR_IDX # define VAR_IDX 0 @@ -110,11 +132,11 @@ std::vector to_vec(std::vector const& vec) } } // namespace detail -#define REQUIRE_APPROX_EQ(ref, out) \ - { \ - auto vec_ref = detail::to_vec(ref); \ - auto vec_out = detail::to_vec(out); \ - REQUIRE_THAT(vec_ref, Catch::Approx(vec_out)); \ +#define REQUIRE_APPROX_EQ(ref, out) \ + { \ + auto vec_ref = detail::to_vec(ref); \ + auto vec_out = detail::to_vec(out); \ + REQUIRE_THAT(vec_ref, Catch::Matchers::Approx(vec_out)); \ } namespace detail @@ -140,7 +162,7 @@ struct bitwise_equal // Catch2 Matcher that calls `std::equal` with a default-constructable custom predicate template -struct CustomEqualsRangeMatcher : Catch::MatcherBase +struct CustomEqualsRangeMatcher : Catch::Matchers::MatcherBase { CustomEqualsRangeMatcher(Range const& range) : range{range} diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 881f553f65d..3441ab07b15 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -13,12 +13,11 @@ workflows: # Old CTK/compiler - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc7', 'gcc9', 'clang14', 'msvc2019']} # Current CTK build-only - - {jobs: ['build'], std: [11, 14], cxx: ['gcc7', 'clang14'], project: 'libcudacxx'} - - 
{jobs: ['build'], std: [17], cxx: ['gcc7', 'clang14']} + - {jobs: ['build'], std: 17, cxx: ['gcc7', 'clang14']} - {jobs: ['build'], std: 'max', cxx: ['gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']} - {jobs: ['build'], std: 'max', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], std: 'max', cxx: ['msvc2019']} - - {jobs: ['build'], std: [17, 20], cxx: ['gcc', 'clang', 'msvc']} + - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']} - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']} @@ -28,13 +27,13 @@ workflows: - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc']} - {jobs: ['test_lid0'], project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100', sm: 'gpu' } # Modded builds: - - {jobs: ['build'], std: [17, 20], ctk: '12.5', cxx: 'nvhpc'} + - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'} # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly. - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} # default_projects: clang-cuda - - {jobs: ['build'], std: [17, 20], cudacxx: 'clang', cxx: 'clang'} + - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90a'} # nvrtc: @@ -45,11 +44,11 @@ workflows: - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20, cxx: ['msvc14.36']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc10', 'gcc11', 'gcc12']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - - {jobs: ['build'], project: 'cudax', ctk: ['12.5'], std: [17, 20], cxx: ['nvhpc']} + - {jobs: ['build'], project: 'cudax', ctk: ['12.5'], std: 'all', cxx: ['nvhpc']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['msvc2022']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17, cxx: ['gcc'], sm: "90"} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc'], sm: "90a"} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: [17, 20], cxx: ['gcc', 'clang'], cpu: 'arm64'} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc12', 'clang', 'msvc']} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'} @@ -122,12 +121,12 @@ workflows: devcontainer_version: '25.02' # All supported C++ standards: -all_stds: [11, 14, 17, 20] +all_stds: [17, 20] ctk_versions: - 12.0: { stds: [11, 14, 17, 20] } - 12.5: { stds: [11, 14, 17, 20] } - 12.6: { stds: [11, 14, 17, 20], aka: 'curr' } + 12.0: { stds: [17, 20] } + 12.5: { stds: [17, 20] } + 12.6: { stds: [17, 20], aka: 'curr' } device_compilers: nvcc: # Version / stds are taken from CTK @@ -143,37 +142,37 @@ host_compilers: container_tag: 'gcc' exe: 'g++' versions: - 7: { stds: [11, 14, 17, ] } - 8: { stds: [11, 14, 17, ] } - 9: { stds: [11, 14, 17, ] } - 10: { stds: [11, 14, 17, 20] } - 11: { stds: [11, 14, 17, 20] } - 12: { stds: [11, 14, 17, 
20] } - 13: { stds: [11, 14, 17, 20] } + 7: { stds: [17, ] } + 8: { stds: [17, ] } + 9: { stds: [17, ] } + 10: { stds: [17, 20] } + 11: { stds: [17, 20] } + 12: { stds: [17, 20] } + 13: { stds: [17, 20] } clang: name: 'Clang' container_tag: 'llvm' exe: 'clang++' versions: - 14: { stds: [11, 14, 17, 20] } - 15: { stds: [11, 14, 17, 20] } - 16: { stds: [11, 14, 17, 20] } - 17: { stds: [11, 14, 17, 20] } - 18: { stds: [11, 14, 17, 20] } + 14: { stds: [17, 20] } + 15: { stds: [17, 20] } + 16: { stds: [17, 20] } + 17: { stds: [17, 20] } + 18: { stds: [17, 20] } msvc: name: 'MSVC' container_tag: 'cl' exe: cl versions: - 14.29: { stds: [ 14, 17, ], aka: '2019' } - 14.36: { stds: [ 14, 17, 20] } - 14.39: { stds: [ 14, 17, 20], aka: '2022' } + 14.29: { stds: [ 17, ], aka: '2019' } + 14.36: { stds: [ 17, 20] } + 14.39: { stds: [ 17, 20], aka: '2022' } nvhpc: name: 'NVHPC' container_tag: 'nvhpc' exe: nvc++ versions: - 24.7: { stds: [11, 14, 17, 20 ] } + 24.7: { stds: [17, 20 ] } # Jobs support the following properties: # @@ -234,10 +233,10 @@ jobs: projects: cccl: name: 'CCCL' - stds: [11, 14, 17, 20] + stds: [17, 20] libcudacxx: name: 'libcu++' - stds: [11, 14, 17, 20] + stds: [17, 20] cub: name: 'CUB' stds: [17, 20] diff --git a/ci/test_python.sh b/ci/test_python.sh index bd66cc57716..34900fdb8e0 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -8,25 +8,28 @@ print_environment_details fail_if_no_gpu -readonly prefix="${BUILD_DIR}/python/" -export PYTHONPATH="${prefix}:${PYTHONPATH:-}" +begin_group "⚙️ Existing site-packages" +pip freeze +end_group "⚙️ Existing site-packages" -pushd ../python/cuda_cooperative >/dev/null +for module in cuda_parallel cuda_cooperative; do -run_command "⚙️ Pip install cuda_cooperative" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "🚀 Pytest cuda_cooperative" python -m pytest -v ./tests + pushd "../python/${module}" >/dev/null -popd >/dev/null + TEMP_VENV_DIR="/tmp/${module}_venv" + rm -rf "${TEMP_VENV_DIR}" + python -m venv "${TEMP_VENV_DIR}" + . "${TEMP_VENV_DIR}/bin/activate" + echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt + run_command "⚙️ Pip install ${module}" pip install -c /tmp/cuda-cccl_constraints.txt .[test] + begin_group "⚙️ ${module} site-packages" + pip freeze + end_group "⚙️ ${module} site-packages" + run_command "🚀 Pytest ${module}" python -m pytest -v ./tests + deactivate -pushd ../python/cuda_parallel >/dev/null + popd >/dev/null -# Temporarily install the package twice to populate include directory as part of the first installation -# and to let manifest discover these includes during the second installation. Do not forget to remove the -# second installation after https://github.com/NVIDIA/cccl/issues/2281 is addressed. 
-run_command "⚙️ Pip install cuda_parallel once" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "⚙️ Pip install cuda_parallel twice" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "🚀 Pytest cuda_parallel" python -m pytest -v ./tests - -popd >/dev/null +done print_time_summary diff --git a/ci/update_version.sh b/ci/update_version.sh index c43303449bb..6a25a837d50 100755 --- a/ci/update_version.sh +++ b/ci/update_version.sh @@ -37,6 +37,7 @@ CUB_CMAKE_VERSION_FILE="lib/cmake/cub/cub-config-version.cmake" LIBCUDACXX_CMAKE_VERSION_FILE="lib/cmake/libcudacxx/libcudacxx-config-version.cmake" THRUST_CMAKE_VERSION_FILE="lib/cmake/thrust/thrust-config-version.cmake" CUDAX_CMAKE_VERSION_FILE="lib/cmake/cudax/cudax-config-version.cmake" +CUDA_CCCL_VERSION_FILE="python/cuda_cccl/cuda/cccl/_version.py" CUDA_COOPERATIVE_VERSION_FILE="python/cuda_cooperative/cuda/cooperative/_version.py" CUDA_PARALLEL_VERSION_FILE="python/cuda_parallel/cuda/parallel/_version.py" @@ -110,6 +111,7 @@ update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MAJOR \([0-9]\+\))" " update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MINOR \([0-9]\+\))" "set(cudax_VERSION_MINOR $minor)" update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_PATCH \([0-9]\+\))" "set(cudax_VERSION_PATCH $patch)" +update_file "$CUDA_CCCL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$major.$minor.$patch\"" update_file "$CUDA_COOPERATIVE_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" update_file "$CUDA_PARALLEL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" diff --git a/cmake/CCCLGetDependencies.cmake b/cmake/CCCLGetDependencies.cmake index cd7f3c9fc41..1a97d98820b 100644 --- a/cmake/CCCLGetDependencies.cmake +++ b/cmake/CCCLGetDependencies.cmake @@ -14,7 +14,7 @@ endmacro() macro(cccl_get_catch2) include("${_cccl_cpm_file}") - CPMAddPackage("gh:catchorg/Catch2@2.13.9") + CPMAddPackage("gh:catchorg/Catch2@3.8.0") endmacro() macro(cccl_get_fmt) diff --git a/cub/benchmarks/bench/merge_sort/keys.cu b/cub/benchmarks/bench/merge_sort/keys.cu index e1d7d165a79..9cca06463bb 100644 --- a/cub/benchmarks/bench/merge_sort/keys.cu +++ b/cub/benchmarks/bench/merge_sort/keys.cu @@ -25,6 +25,7 @@ * ******************************************************************************/ +#include #include #include @@ -84,7 +85,7 @@ void keys(nvbench::state& state, nvbench::type_list) using value_input_it_t = value_t*; using key_it_t = key_t*; using value_it_t = value_t*; - using offset_t = OffsetT; + using offset_t = cub::detail::choose_offset_t; using compare_op_t = less_t; #if !TUNE_BASE diff --git a/cub/benchmarks/bench/merge_sort/pairs.cu b/cub/benchmarks/bench/merge_sort/pairs.cu index f0238063efe..7b54cc49863 100644 --- a/cub/benchmarks/bench/merge_sort/pairs.cu +++ b/cub/benchmarks/bench/merge_sort/pairs.cu @@ -25,6 +25,7 @@ * ******************************************************************************/ +#include #include #include @@ -81,7 +82,7 @@ void pairs(nvbench::state& state, nvbench::type_list) using value_input_it_t = value_t*; using key_it_t = key_t*; using value_it_t = value_t*; - using offset_t = OffsetT; + using offset_t = cub::detail::choose_offset_t; using compare_op_t = less_t; #if !TUNE_BASE diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index fcd81e660f6..0da5f982561 100644 --- 
a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -86,10 +86,10 @@ void init_output_partition_buffer( FlagsItT d_flags, OffsetT num_items, T* d_out, - cub::detail::partition_distinct_output_t& d_partition_out_buffer) + cub::detail::select::partition_distinct_output_t& d_partition_out_buffer) { const auto selected_elements = thrust::count(d_flags, d_flags + num_items, true); - d_partition_out_buffer = cub::detail::partition_distinct_output_t{d_out, d_out + selected_elements}; + d_partition_out_buffer = cub::detail::select::partition_distinct_output_t{d_out, d_out + selected_elements}; } template @@ -109,7 +109,7 @@ void flagged(nvbench::state& state, nvbench::type_list, T*>::type; + conditional, T*>::type; #if !TUNE_BASE using policy_t = policy_hub_t; diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu index d456e65fc1c..337586d7f94 100644 --- a/cub/benchmarks/bench/partition/if.cu +++ b/cub/benchmarks/bench/partition/if.cu @@ -112,10 +112,10 @@ void init_output_partition_buffer( OffsetT num_items, T* d_out, SelectOpT select_op, - cub::detail::partition_distinct_output_t& d_partition_out_buffer) + cub::detail::select::partition_distinct_output_t& d_partition_out_buffer) { const auto selected_elements = thrust::count_if(d_in, d_in + num_items, select_op); - d_partition_out_buffer = cub::detail::partition_distinct_output_t{d_out, d_out + selected_elements}; + d_partition_out_buffer = cub::detail::select::partition_distinct_output_t{d_out, d_out + selected_elements}; } template @@ -135,7 +135,7 @@ void partition(nvbench::state& state, nvbench::type_list, T*>::type; + conditional, T*>::type; #if !TUNE_BASE using policy_t = policy_hub_t; diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu index b6b9e4fd537..bd04bcf3d43 100644 --- a/cub/benchmarks/bench/radix_sort/keys.cu +++ b/cub/benchmarks/bench/radix_sort/keys.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -109,7 +110,8 @@ constexpr std::size_t max_onesweep_temp_storage_size() using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; using hist_agent = cub::AgentRadixSortHistogram; - return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); + return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), + sizeof(typename hist_agent::TempStorage)); } template diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu index 4a9f229bca4..35d589f453e 100644 --- a/cub/benchmarks/bench/radix_sort/pairs.cu +++ b/cub/benchmarks/bench/radix_sort/pairs.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -107,7 +108,8 @@ constexpr std::size_t max_onesweep_temp_storage_size() using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; using hist_agent = cub::AgentRadixSortHistogram; - return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); + return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), + sizeof(typename hist_agent::TempStorage)); } template diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh index 9de575d0686..579d3757d3c 100644 --- a/cub/benchmarks/bench/reduce/base.cuh +++ b/cub/benchmarks/bench/reduce/base.cuh @@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list) }); } 
-NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types)) +NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); diff --git a/cub/benchmarks/bench/reduce/max.cu b/cub/benchmarks/bench/reduce/custom.cu similarity index 81% rename from cub/benchmarks/bench/reduce/max.cu rename to cub/benchmarks/bench/reduce/custom.cu index 791d5bfe167..0203ef60b8c 100644 --- a/cub/benchmarks/bench/reduce/max.cu +++ b/cub/benchmarks/bench/reduce/custom.cu @@ -25,11 +25,18 @@ * ******************************************************************************/ +// This benchmark uses a custom reduction operation, max_t, which is not known to CUB, so no operator specific +// optimizations (e.g. using redux or DPX instructions) are performed. This benchmark covers the unoptimized code path. + +// Because CUB cannot detect this operator, we cannot add any tunings based on the results of this benchmark. Its main +// use is to detect regressions. + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = max_t; +using value_types = all_types; +using op_t = max_t; #include "base.cuh" diff --git a/cub/benchmarks/bench/reduce/min.cu b/cub/benchmarks/bench/reduce/min.cu index 177d7628f6f..50b175f4ca8 100644 --- a/cub/benchmarks/bench/reduce/min.cu +++ b/cub/benchmarks/bench/reduce/min.cu @@ -24,14 +24,23 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ -// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures. -// It specifically uses cuda::minimum<> instead of a user-defined operator. -#define TUNE_T int16_t + +// This benchmark is intended to cover DPX instructions on Hopper+ architectures. It specifically uses cuda::minimum<> +// instead of a user-defined operator, which CUB recognizes to select an optimized code path. + +// Tuning parameters found for ::cuda::minimum<> apply equally for ::cuda::maximum<> +// Tuning parameters found for signed integer types apply equally for unsigned integer types +// TODO(bgruber): do tuning parameters found for int16_t apply equally for __half or __nv_bfloat16 on SM90+? + +#include + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = ::cuda::minimum<>; +// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them. +using value_types = fundamental_types; +using op_t = ::cuda::minimum<>; #include "base.cuh" diff --git a/cub/benchmarks/bench/reduce/sum.cu b/cub/benchmarks/bench/reduce/sum.cu index 4433724f090..ab65d7fe847 100644 --- a/cub/benchmarks/bench/reduce/sum.cu +++ b/cub/benchmarks/bench/reduce/sum.cu @@ -25,11 +25,18 @@ * ******************************************************************************/ +// This benchmark is intended to cover redux instructions on Ampere+ architectures. It specifically uses +// cuda::std::plus<> instead of a user-defined operator, which CUB recognizes to select an optimized code path. 
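// An illustrative sketch, not taken from the patch: the call pattern this benchmark covers. Passing the
// well-known ::cuda::std::plus<> to cub::DeviceReduce::Reduce lets CUB select its optimized sum path;
// swapping in a hand-written functor such as the hypothetical my_plus below exercises the generic,
// unoptimized path. Problem size and values are arbitrary.
#include <cub/device/device_reduce.cuh>

#include <thrust/device_vector.h>

#include <cuda/std/functional>

struct my_plus // user-defined operator: CUB cannot recognize it, so no operator-specific instructions are used
{
  __host__ __device__ int operator()(int a, int b) const
  {
    return a + b;
  }
};

int main()
{
  thrust::device_vector<int> in(1 << 20, 1);
  thrust::device_vector<int> out(1);
  const int* d_in = thrust::raw_pointer_cast(in.data());
  int* d_out      = thrust::raw_pointer_cast(out.data());
  const int n     = static_cast<int>(in.size());

  // Standard two-phase CUB call: first query the temporary storage size, then run the reduction.
  void* d_temp      = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n, ::cuda::std::plus<>{}, 0);
  thrust::device_vector<char> temp(temp_bytes);
  cub::DeviceReduce::Reduce(
    thrust::raw_pointer_cast(temp.data()), temp_bytes, d_in, d_out, n, ::cuda::std::plus<>{}, 0);
  // Replacing ::cuda::std::plus<>{} with my_plus{} gives the same result through the generic code path.
  return out[0] == n ? 0 : 1;
}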
+ +// Tuning parameters found for signed integer types apply equally for unsigned integer types + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = ::cuda::std::plus<>; +// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them. +using value_types = all_types; +using op_t = ::cuda::std::plus<>; #include "base.cuh" diff --git a/cub/benchmarks/nvbench_helper/CMakeLists.txt b/cub/benchmarks/nvbench_helper/CMakeLists.txt index 24b12c12154..bf8581fbf79 100644 --- a/cub/benchmarks/nvbench_helper/CMakeLists.txt +++ b/cub/benchmarks/nvbench_helper/CMakeLists.txt @@ -26,10 +26,9 @@ if (CUB_ENABLE_NVBENCH_HELPER_TESTS) test/gen_range.cu test/gen_entropy.cu test/gen_uniform_distribution.cu - test/gen_power_law_distribution.cu - test/main.cpp) + test/gen_power_law_distribution.cu) cccl_configure_target(${nvbench_helper_test_target} DIALECT 17) - target_link_libraries(${nvbench_helper_test_target} PRIVATE nvbench_helper Catch2::Catch2 Boost::math) + target_link_libraries(${nvbench_helper_test_target} PRIVATE nvbench_helper Catch2::Catch2WithMain Boost::math) if ("${device_system}" STREQUAL "cpp") target_compile_definitions(${nvbench_helper_test_target} PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP) endif() diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index e1928ec8516..8324650d044 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -1,5 +1,7 @@ #pragma once +#include + #include #include @@ -50,20 +52,20 @@ struct nvbench::type_strings<::cuda::std::integral_constant> namespace detail { -template +template struct push_back {}; -template -struct push_back> +template +struct push_back, Ts...> { - using type = nvbench::type_list; + using type = nvbench::type_list; }; } // namespace detail -template -using push_back_t = typename detail::push_back::type; +template +using push_back_t = typename detail::push_back::type; #ifdef TUNE_OffsetT using offset_types = nvbench::type_list; diff --git a/cub/benchmarks/nvbench_helper/test/gen_entropy.cu b/cub/benchmarks/nvbench_helper/test/gen_entropy.cu index 967b8ff0e88..12c96154f94 100644 --- a/cub/benchmarks/nvbench_helper/test/gen_entropy.cu +++ b/cub/benchmarks/nvbench_helper/test/gen_entropy.cu @@ -36,7 +36,8 @@ #include #include -#include +#include +#include #include template diff --git a/cub/benchmarks/nvbench_helper/test/gen_power_law_distribution.cu b/cub/benchmarks/nvbench_helper/test/gen_power_law_distribution.cu index 0d06d308b0b..599bb9293cb 100644 --- a/cub/benchmarks/nvbench_helper/test/gen_power_law_distribution.cu +++ b/cub/benchmarks/nvbench_helper/test/gen_power_law_distribution.cu @@ -33,7 +33,8 @@ #include #include -#include +#include +#include #include bool is_normal(thrust::host_vector data) diff --git a/cub/benchmarks/nvbench_helper/test/gen_range.cu b/cub/benchmarks/nvbench_helper/test/gen_range.cu index 064e0b2f1d2..f4eba3183b9 100644 --- a/cub/benchmarks/nvbench_helper/test/gen_range.cu +++ b/cub/benchmarks/nvbench_helper/test/gen_range.cu @@ -30,7 +30,8 @@ #include -#include +#include +#include #include using types = diff --git a/cub/benchmarks/nvbench_helper/test/gen_seed.cu b/cub/benchmarks/nvbench_helper/test/gen_seed.cu index 3f04b2c88d1..9f27d6931d5 100644 --- 
a/cub/benchmarks/nvbench_helper/test/gen_seed.cu +++ b/cub/benchmarks/nvbench_helper/test/gen_seed.cu @@ -28,7 +28,7 @@ #include #include -#include +#include #include using types = diff --git a/cub/benchmarks/nvbench_helper/test/gen_uniform_distribution.cu b/cub/benchmarks/nvbench_helper/test/gen_uniform_distribution.cu index ed09ef7535e..d37ba2b8fb6 100644 --- a/cub/benchmarks/nvbench_helper/test/gen_uniform_distribution.cu +++ b/cub/benchmarks/nvbench_helper/test/gen_uniform_distribution.cu @@ -34,7 +34,9 @@ #include #include -#include +#include +#include +#include #include template diff --git a/cub/benchmarks/nvbench_helper/test/main.cpp b/cub/benchmarks/nvbench_helper/test/main.cpp deleted file mode 100644 index 5dc819f5caa..00000000000 --- a/cub/benchmarks/nvbench_helper/test/main.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -#define CATCH_CONFIG_MAIN -#include diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh index 37e1a013193..c19cb90079a 100644 --- a/cub/cub/agent/agent_adjacent_difference.cuh +++ b/cub/cub/agent/agent_adjacent_difference.cuh @@ -63,6 +63,11 @@ struct AgentAdjacentDifferencePolicy static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; +namespace detail +{ +namespace adjacent_difference +{ + template +using AgentDifference CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = detail::adjacent_difference:: + AgentDifference; + +template +using AgentDifferenceInit CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::adjacent_difference::AgentDifferenceInit; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh index f9d5e8b16a1..2b926f582fe 100644 --- a/cub/cub/agent/agent_batch_memcpy.cuh +++ b/cub/cub/agent/agent_batch_memcpy.cuh @@ -60,6 +60,8 @@ CUB_NAMESPACE_BEGIN namespace detail { +namespace batch_memcpy +{ template _CCCL_FORCEINLINE _CCCL_DEVICE void LoadVectorAndFunnelShiftR(uint32_t const* aligned_ptr, uint32_t bit_shift, uint4& data_out) @@ -834,7 +836,7 @@ private: BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage) .ExclusiveSum(block_offset, block_offset, blev_tile_prefix_op); } - CTA_SYNC(); + __syncthreads(); // Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration) blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD; @@ -996,7 +998,7 @@ private: // Ensure all threads finished collaborative BlockExchange so temporary storage can be reused // with next iteration - CTA_SYNC(); + __syncthreads(); } } @@ -1026,7 +1028,7 @@ public: } // Ensure we can repurpose the BlockLoad's temporary storage - CTA_SYNC(); + __syncthreads(); // Count how many buffers fall into each size-class VectorizedSizeClassCounterT size_class_histogram = GetBufferSizeClassHistogram(buffer_sizes); @@ -1037,7 +1039,7 @@ public: .ExclusiveSum(size_class_histogram, size_class_histogram, size_class_agg); // Ensure we can repurpose the scan's temporary storage for scattering the buffer ids - CTA_SYNC(); + __syncthreads(); // Factor in the per-size-class counts / offsets // That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset @@ -1077,7 +1079,7 @@ public: // Ensure the prefix callback has finished using its temporary storage and that it can be reused // in the next stage - CTA_SYNC(); + __syncthreads(); // Scatter the buffers into one of the three partitions (TLEV, WLEV, BLEV) depending on their // size @@ -1085,7 +1087,7 @@ public: // Ensure all buffers have been partitioned by their size class AND // ensure that blev_buffer_offset has been written to shared memory - CTA_SYNC(); + __syncthreads(); // TODO: think about prefetching tile_buffer_{srcs,dsts} into shmem InputBufferIt tile_buffer_srcs = input_buffer_it + buffer_offset; @@ -1104,7 +1106,7 @@ public: tile_id); // Ensure we can repurpose the temporary storage required by EnqueueBLEVBuffers - CTA_SYNC(); + __syncthreads(); // Copy warp-level buffers BatchMemcpyWLEVBuffers( @@ -1172,7 +1174,7 @@ private: // buffers BLevBlockOffsetTileState blev_block_scan_state; }; - +} // namespace 
batch_memcpy } // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index e454dc837b1..2e98bf76771 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -134,6 +134,11 @@ struct AgentHistogramPolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace histogram +{ + /** * @brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating * in device-wide histogram . @@ -320,7 +325,7 @@ struct AgentHistogram } // Barrier to make sure all threads are done updating counters - CTA_SYNC(); + __syncthreads(); } // Initialize privatized bin counters. Specialized for privatized shared-memory counters @@ -350,7 +355,7 @@ struct AgentHistogram _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) { // Barrier to make sure all threads are done updating counters - CTA_SYNC(); + __syncthreads(); // Apply privatized bin counts to output bin counts #pragma unroll @@ -690,7 +695,7 @@ struct AgentHistogram ConsumeTile(tile_offset, TILE_SAMPLES); } - CTA_SYNC(); + __syncthreads(); // Get next tile if (threadIdx.x == 0) @@ -698,7 +703,7 @@ struct AgentHistogram temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; } - CTA_SYNC(); + __syncthreads(); tile_idx = temp_storage.tile_idx; } @@ -914,4 +919,31 @@ struct AgentHistogram } }; +} // namespace histogram +} // namespace detail + +template +using AgentHistogram CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::histogram::AgentHistogram< + AgentHistogramPolicyT, + PRIVATIZED_SMEM_BINS, + NUM_CHANNELS, + NUM_ACTIVE_CHANNELS, + SampleIteratorT, + CounterT, + PrivatizedDecodeOpT, + OutputDecodeOpT, + OffsetT, + LEGACY_PTX_ARCH>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh index ae457bb954d..9ae14c3e42e 100644 --- a/cub/cub/agent/agent_merge.cuh +++ b/cub/cub/agent/agent_merge.cuh @@ -22,7 +22,8 @@ #include -#include +#include +#include CUB_NAMESPACE_BEGIN namespace detail @@ -116,7 +117,7 @@ struct agent_t const Offset partition_end = merge_partitions[tile_idx + 1]; const Offset diag0 = items_per_tile * tile_idx; - const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + items_per_tile); + const Offset diag1 = (::cuda::std::min)(keys1_count + keys2_count, diag0 + items_per_tile); // compute bounding box for keys1 & keys2 const Offset keys1_beg = partition_beg; @@ -129,14 +130,14 @@ struct agent_t const int num_keys2 = static_cast(keys2_end - keys2_beg); key_type keys_loc[items_per_thread]; - gmem_to_reg( + merge_sort::gmem_to_reg( keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2); - reg_to_shared(&storage.keys_shared[0], keys_loc); - CTA_SYNC(); + merge_sort::reg_to_shared(&storage.keys_shared[0], keys_loc); + __syncthreads(); // use binary search in shared memory to find merge path for each of thread. 
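// The "merge path" referenced in the comment above, as a stand-alone host-side sketch using std types only;
// merge_path here is an illustrative helper, not CUB's MergePath. For a diagonal diag of the merged output,
// a binary search finds how many elements come from the first sorted range, so each thread or tile can merge
// an independent, equally sized piece.
#include <algorithm>
#include <cassert>
#include <vector>

// Returns how many of the first `diag` merged elements are taken from `a`; the rest come from `b`.
int merge_path(const std::vector<int>& a, const std::vector<int>& b, int diag)
{
  int lo = std::max(0, diag - static_cast<int>(b.size()));
  int hi = std::min(diag, static_cast<int>(a.size()));
  while (lo < hi)
  {
    const int mid = (lo + hi) / 2;
    if (a[mid] <= b[diag - 1 - mid]) // does the diagonal cross above or below this point?
    {
      lo = mid + 1;
    }
    else
    {
      hi = mid;
    }
  }
  return lo;
}

int main()
{
  const std::vector<int> a{1, 3, 5, 7};
  const std::vector<int> b{2, 4, 6, 8};
  // Splitting the 8-element merge at its midpoint takes two elements from each input (1,3 and 2,4).
  assert(merge_path(a, b, 4) == 2);
  return 0;
}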
// we can use int type here, because the number of items in shared memory is limited - const int diag0_loc = min(num_keys1 + num_keys2, items_per_thread * threadIdx.x); + const int diag0_loc = (::cuda::std::min)(num_keys1 + num_keys2, static_cast(items_per_thread * threadIdx.x)); const int keys1_beg_loc = MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op); @@ -158,7 +159,7 @@ struct agent_t keys_loc, indices, compare_op); - CTA_SYNC(); + __syncthreads(); // write keys if (IsFullTile) @@ -180,11 +181,12 @@ struct agent_t #endif // _CCCL_CUDACC_AT_LEAST(11, 8) { item_type items_loc[items_per_thread]; - gmem_to_reg( + merge_sort::gmem_to_reg( items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2); - CTA_SYNC(); // block_store_keys above uses shared memory, so make sure all threads are done before we write to it - reg_to_shared(&storage.items_shared[0], items_loc); - CTA_SYNC(); + __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write + // to it + merge_sort::reg_to_shared(&storage.items_shared[0], items_loc); + __syncthreads(); // gather items from shared mem #pragma unroll @@ -192,7 +194,7 @@ struct agent_t { items_loc[i] = storage.items_shared[indices[i]]; } - CTA_SYNC(); + __syncthreads(); // write from reg to gmem if (IsFullTile) @@ -214,7 +216,7 @@ struct agent_t const Offset tile_base = tile_idx * items_per_tile; // TODO(bgruber): random mixing of int and Offset const int items_in_tile = - static_cast(cub::min(static_cast(items_per_tile), keys1_count + keys2_count - tile_base)); + static_cast((::cuda::std::min)(static_cast(items_per_tile), keys1_count + keys2_count - tile_base)); if (items_in_tile == items_per_tile) { consume_tile(tile_idx, tile_base, items_per_tile); // full tile diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index dd8b559f2c4..4c74b73baf2 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -45,6 +45,9 @@ #include +#include +#include + CUB_NAMESPACE_BEGIN template (blockIdx.x); auto num_tiles = static_cast(gridDim.x); auto tile_base = tile_idx * ITEMS_PER_TILE; - int items_in_tile = (cub::min)(keys_count - tile_base, int{ITEMS_PER_TILE}); + int items_in_tile = (::cuda::std::min)(static_cast(keys_count - tile_base), int{ITEMS_PER_TILE}); if (tile_idx < num_tiles - 1) { @@ -187,7 +194,7 @@ struct AgentBlockSort BlockLoadItems(storage.load_items).Load(items_in + tile_base, items_local); } - CTA_SYNC(); + __syncthreads(); } KeyT keys_local[ITEMS_PER_THREAD]; @@ -200,7 +207,7 @@ struct AgentBlockSort BlockLoadKeys(storage.load_keys).Load(keys_in + tile_base, keys_local); } - CTA_SYNC(); + __syncthreads(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) @@ -212,7 +219,7 @@ struct AgentBlockSort BlockMergeSortT(storage.block_merge).Sort(keys_local, items_local, compare_op); } - CTA_SYNC(); + __syncthreads(); if (ping) { @@ -227,7 +234,7 @@ struct AgentBlockSort _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { - CTA_SYNC(); + __syncthreads(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { @@ -252,7 +259,7 @@ struct AgentBlockSort _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { - CTA_SYNC(); + __syncthreads(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { @@ -335,10 +342,10 @@ struct AgentPartition // partition_idx / target_merged_tiles_number const OffsetT local_tile_idx = mask & partition_idx; - const OffsetT keys1_beg = (cub::min)(keys_count, start); - const OffsetT keys1_end = 
(cub::min)(keys_count, detail::safe_add_bound_to_max(start, size)); + const OffsetT keys1_beg = (::cuda::std::min)(keys_count, start); + const OffsetT keys1_end = (::cuda::std::min)(keys_count, detail::safe_add_bound_to_max(start, size)); const OffsetT keys2_beg = keys1_end; - const OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); + const OffsetT keys2_end = (::cuda::std::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); _CCCL_PDL_GRID_DEPENDENCY_SYNC(); @@ -349,7 +356,7 @@ struct AgentPartition } else { - const OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); + const OffsetT partition_at = (::cuda::std::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); OffsetT partition_diag = ping @@ -371,8 +378,6 @@ struct AgentPartition } }; -namespace detail -{ /** * \brief Concatenates up to ITEMS_PER_THREAD elements from input{1,2} into output array * @@ -418,7 +423,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void reg_to_shared(It output, T (&input)[ITEMS_PE output[idx] = input[item]; } } -} // namespace detail /// \brief The agent is responsible for merging N consecutive sorted arrays into N/2 sorted arrays. template = keys1_beg, because diag is the distance of the total merge path so far (keys1 + keys2) // diag+ITEMS_PER_TILE >= keys1_end, because diag+ITEMS_PER_TILE is the distance of the merge path for the next tile // and keys1_end is key1's component of that path - const OffsetT keys2_beg = (cub::min)(max_keys2, diag - keys1_beg); - OffsetT keys2_end = - (cub::min)(max_keys2, detail::safe_add_bound_to_max(diag, static_cast(ITEMS_PER_TILE)) - keys1_end); + const OffsetT keys2_beg = (::cuda::std::min)(max_keys2, diag - keys1_beg); + OffsetT keys2_end = (::cuda::std::min)( + max_keys2, detail::safe_add_bound_to_max(diag, static_cast(ITEMS_PER_TILE)) - keys1_end); // Check if it's the last tile in the tile group being merged if (mask == (mask & tile_idx)) { - keys1_end = (cub::min)(keys_count - start, size); - keys2_end = (cub::min)(max_keys2, size); + keys1_end = (::cuda::std::min)(keys_count - start, size); + keys2_end = (::cuda::std::min)(max_keys2, size); } // number of keys per tile @@ -547,15 +551,15 @@ struct AgentMerge KeyT keys_local[ITEMS_PER_THREAD]; if (ping) { - detail::gmem_to_reg( + gmem_to_reg( keys_local, keys_in_ping + start + keys1_beg, keys_in_ping + start + size + keys2_beg, num_keys1, num_keys2); } else { - detail::gmem_to_reg( + gmem_to_reg( keys_local, keys_in_pong + start + keys1_beg, keys_in_pong + start + size + keys2_beg, num_keys1, num_keys2); } - detail::reg_to_shared(&storage.keys_shared[0], keys_local); + reg_to_shared(&storage.keys_shared[0], keys_local); // preload items into registers already // @@ -565,7 +569,7 @@ struct AgentMerge { if (ping) { - detail::gmem_to_reg( + gmem_to_reg( items_local, items_in_ping + start + keys1_beg, items_in_ping + start + size + keys2_beg, @@ -574,7 +578,7 @@ struct AgentMerge } else { - detail::gmem_to_reg( + gmem_to_reg( items_local, items_in_pong + start + keys1_beg, items_in_pong + start + size + keys2_beg, @@ -583,7 +587,7 @@ struct AgentMerge } } - CTA_SYNC(); + __syncthreads(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // use binary search in shared memory @@ -591,7 +595,7 @@ struct AgentMerge // we can use int type here, because the number of // items in shared memory is limited // - const int diag0_local = (cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); + const int diag0_local = (::cuda::std::min)(num_keys1 + 
num_keys2, ITEMS_PER_THREAD * tid); const int keys1_beg_local = MergePath( &storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_local, compare_op); @@ -616,7 +620,7 @@ struct AgentMerge indices, compare_op); - CTA_SYNC(); + __syncthreads(); // write keys if (ping) @@ -650,11 +654,11 @@ struct AgentMerge _CCCL_IF_CONSTEXPR (!KEYS_ONLY) #endif // _CCCL_CUDACC_AT_LEAST(11, 8) { - CTA_SYNC(); + __syncthreads(); - detail::reg_to_shared(&storage.items_shared[0], items_local); + reg_to_shared(&storage.items_shared[0], items_local); - CTA_SYNC(); + __syncthreads(); // gather items from shared mem // @@ -664,7 +668,7 @@ struct AgentMerge items_local[item] = storage.items_shared[indices[item]]; } - CTA_SYNC(); + __syncthreads(); // write from reg to gmem // @@ -731,7 +735,7 @@ struct AgentMerge const OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE; const int tid = static_cast(threadIdx.x); const int items_in_tile = - static_cast((cub::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); + static_cast((::cuda::std::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); if (tile_idx < num_tiles - 1) { @@ -744,4 +748,45 @@ struct AgentMerge } }; +} // namespace merge_sort +} // namespace detail + +template +using AgentBlockSort CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::merge_sort::AgentBlockSort< + Policy, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + KeyT, + ValueT>; + +template +using AgentPartition CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::merge_sort::AgentPartition; + +template +using AgentMerge CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public interface " + "will be removed.") = + detail::merge_sort::AgentMerge; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_radix_sort_downsweep.cuh b/cub/cub/agent/agent_radix_sort_downsweep.cuh index 43562c9a2b5..cc6e5c18f11 100644 --- a/cub/cub/agent/agent_radix_sort_downsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_downsweep.cuh @@ -124,6 +124,11 @@ struct AgentRadixSortDownsweepPolicy : ScalingType * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace radix_sort +{ + /** * @brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in * device-wide radix sort downsweep . 
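// The pattern applied throughout these agent headers: the agent moves into a detail namespace and the old
// public name is kept only as a deprecated alias (via CCCL_DEPRECATED_BECAUSE), so downstream code that still
// names the agent compiles but is warned that it relies on an implementation detail. Below is a minimal
// stand-alone illustration of the same technique using the standard [[deprecated]] attribute; the names lib
// and Agent are placeholders, not CUB's.
namespace lib
{
namespace detail
{
template <typename T>
struct Agent
{
  T value;
};
} // namespace detail

template <typename T>
using Agent [[deprecated("This class is an implementation detail; use is discouraged.")]] = detail::Agent<T>;
} // namespace lib

int main()
{
  lib::detail::Agent<int> internal_use{7}; // internal spelling, no diagnostic
  lib::Agent<int> legacy_use{42}; // still compiles, flagged as deprecated by conforming compilers
  return internal_use.value + legacy_use.value;
}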
@@ -148,14 +153,14 @@ template + typename DecomposerT = identity_decomposer_t> struct AgentRadixSortDownsweep { //--------------------------------------------------------------------- // Type definitions and constants //--------------------------------------------------------------------- - using traits = detail::radix::traits_t; + using traits = radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; @@ -182,8 +187,7 @@ struct AgentRadixSortDownsweep using ValuesItr = CacheModifiedInputIterator; // Radix ranking type to use - using BlockRadixRankT = - cub::detail::block_radix_rank_t; + using BlockRadixRankT = block_radix_rank_t; // Digit extractor type using fundamental_digit_extractor_t = BFEDigitExtractor; @@ -277,7 +281,7 @@ struct AgentRadixSortDownsweep temp_storage.keys_and_offsets.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -305,7 +309,7 @@ struct AgentRadixSortDownsweep int (&ranks)[ITEMS_PER_THREAD], OffsetT valid_items) { - CTA_SYNC(); + __syncthreads(); ValueExchangeT& exchange_values = temp_storage.exchange_values.Alias(); @@ -315,7 +319,7 @@ struct AgentRadixSortDownsweep exchange_values[ranks[ITEM]] = values[ITEM]; } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -342,7 +346,7 @@ struct AgentRadixSortDownsweep { BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); - CTA_SYNC(); + __syncthreads(); } /** @@ -362,7 +366,7 @@ struct AgentRadixSortDownsweep BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, valid_items, oob_item); - CTA_SYNC(); + __syncthreads(); } /** @@ -409,7 +413,7 @@ struct AgentRadixSortDownsweep { BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values); - CTA_SYNC(); + __syncthreads(); } /** @@ -428,7 +432,7 @@ struct AgentRadixSortDownsweep BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values, valid_items); - CTA_SYNC(); + __syncthreads(); } /** @@ -474,7 +478,7 @@ struct AgentRadixSortDownsweep { ValueT values[ITEMS_PER_THREAD]; - CTA_SYNC(); + __syncthreads(); LoadValues(values, block_offset, valid_items, Int2Type(), Int2Type()); @@ -520,7 +524,7 @@ struct AgentRadixSortDownsweep int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; BlockRadixRankT(temp_storage.radix_rank).RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix); - CTA_SYNC(); + __syncthreads(); // Share exclusive digit prefix #pragma unroll @@ -534,7 +538,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Get inclusive digit prefix int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; @@ -562,7 +566,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Update global scatter base offsets for each digit #pragma unroll @@ -577,7 +581,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Scatter keys ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); @@ -602,7 +606,7 @@ struct AgentRadixSortDownsweep T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, items); - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items); block_offset += TILE_ITEMS; @@ -616,7 +620,7 @@ struct AgentRadixSortDownsweep T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, 
items, valid_items); - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); } } @@ -670,7 +674,7 @@ struct AgentRadixSortDownsweep } } - short_circuit = CTA_SYNC_AND(short_circuit); + short_circuit = __syncthreads_and(short_circuit); } /** @@ -719,7 +723,7 @@ struct AgentRadixSortDownsweep } } - short_circuit = CTA_SYNC_AND(short_circuit); + short_circuit = __syncthreads_and(short_circuit); } /** @@ -744,7 +748,7 @@ struct AgentRadixSortDownsweep ProcessTile(block_offset); block_offset += TILE_ITEMS; - CTA_SYNC(); + __syncthreads(); } // Clean up last partial tile with guarded-I/O @@ -756,4 +760,18 @@ struct AgentRadixSortDownsweep } }; +} // namespace radix_sort +} // namespace detail + +template +using AgentRadixSortDownsweep CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public " + "interface will be removed.") = detail::radix_sort:: + AgentRadixSortDownsweep; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_radix_sort_histogram.cuh b/cub/cub/agent/agent_radix_sort_histogram.cuh index 2785f732450..29580897764 100644 --- a/cub/cub/agent/agent_radix_sort_histogram.cuh +++ b/cub/cub/agent/agent_radix_sort_histogram.cuh @@ -50,6 +50,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN template @@ -79,11 +81,16 @@ struct AgentRadixSortExclusiveSumPolicy }; }; +namespace detail +{ +namespace radix_sort +{ + template + typename DecomposerT = identity_decomposer_t> struct AgentRadixSortHistogram { // constants @@ -98,7 +105,7 @@ struct AgentRadixSortHistogram NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS, }; - using traits = detail::radix::traits_t; + using traits = radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; @@ -172,7 +179,7 @@ struct AgentRadixSortHistogram } } } - CTA_SYNC(); + __syncthreads(); } _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) @@ -199,7 +206,7 @@ struct AgentRadixSortHistogram _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateSharedHistograms(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) { - int part = LaneId() % NUM_PARTS; + int part = ::cuda::ptx::get_sreg_laneid() % NUM_PARTS; #pragma unroll for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { @@ -247,7 +254,7 @@ struct AgentRadixSortHistogram { // Reset the counters. Init(); - CTA_SYNC(); + __syncthreads(); // Process the tiles. OffsetT portion_offset = portion * MAX_PORTION_SIZE; @@ -259,11 +266,11 @@ struct AgentRadixSortHistogram LoadTileKeys(tile_offset, keys); AccumulateSharedHistograms(tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); // Accumulate the result in global memory. 
AccumulateGlobalHistograms(); - CTA_SYNC(); + __syncthreads(); } } @@ -273,4 +280,17 @@ struct AgentRadixSortHistogram } }; +} // namespace radix_sort +} // namespace detail + +template +using AgentRadixSortHistogram CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::radix_sort::AgentRadixSortHistogram; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh index a78ee66c7b2..331012d36b9 100644 --- a/cub/cub/agent/agent_radix_sort_onesweep.cuh +++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh @@ -49,6 +49,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -96,13 +97,18 @@ struct AgentRadixSortOnesweepPolicy : ScalingType static constexpr RadixSortStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; +namespace detail +{ +namespace radix_sort +{ + template + typename DecomposerT = identity_decomposer_t> struct AgentRadixSortOnesweep { // constants @@ -126,7 +132,7 @@ struct AgentRadixSortOnesweep LOOKBACK_VALUE_MASK = ~LOOKBACK_KIND_MASK, }; - using traits = detail::radix::traits_t; + using traits = radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; @@ -279,7 +285,7 @@ struct AgentRadixSortOnesweep } while (value_j == 0); inc_sum += value_j & LOOKBACK_VALUE_MASK; - want_mask = WARP_BALLOT((value_j & LOOKBACK_GLOBAL_MASK) == 0, want_mask); + want_mask = __ballot_sync(want_mask, (value_j & LOOKBACK_GLOBAL_MASK) == 0); if (value_j & LOOKBACK_GLOBAL_MASK) { break; @@ -349,7 +355,7 @@ struct AgentRadixSortOnesweep short_circuit = short_circuit || bins[u] == TILE_ITEMS; } } - short_circuit = CTA_SYNC_OR(short_circuit); + short_circuit = __syncthreads_or(short_circuit); if (!short_circuit) { return; @@ -377,7 +383,7 @@ struct AgentRadixSortOnesweep LoadBinsToOffsetsGlobal(offsets); LookbackGlobal(bins); UpdateBinsGlobal(bins, offsets); - CTA_SYNC(); + __syncthreads(); // scatter the keys OffsetT global_offset = s.global_offsets[common_bin]; @@ -483,7 +489,7 @@ struct AgentRadixSortOnesweep { d_keys_out[global_idx] = Twiddle::Out(key, decomposer); } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); } } @@ -501,7 +507,7 @@ struct AgentRadixSortOnesweep { d_values_out[global_idx] = value; } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); } } @@ -527,7 +533,7 @@ struct AgentRadixSortOnesweep { num_writes -= int(global_idx + 1) % ALIGN; } - num_writes = SHFL_IDX_SYNC(num_writes, last_lane, WARP_MASK); + num_writes = __shfl_sync(WARP_MASK, num_writes, last_lane); if (lane < num_writes) { ThreadStore(&d_keys_out[global_idx], key_out); @@ -600,10 +606,10 @@ struct AgentRadixSortOnesweep LoadValues(block_idx * TILE_ITEMS, values); // scatter values - CTA_SYNC(); + __syncthreads(); ScatterValuesShared(values, ranks); - CTA_SYNC(); + __syncthreads(); ScatterValuesGlobal(digits); } @@ -625,7 +631,7 @@ struct AgentRadixSortOnesweep .RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix, CountsCallback(*this, bins, keys)); // scatter keys in shared memory - CTA_SYNC(); + __syncthreads(); ScatterKeysShared(keys, ranks); // compute global offsets @@ -634,7 +640,7 @@ struct AgentRadixSortOnesweep UpdateBinsGlobal(bins, exclusive_digit_prefix); // scatter keys in global memory - CTA_SYNC(); + __syncthreads(); ScatterKeysGlobal(); // scatter values if necessary @@ -669,7 +675,7 @@ struct AgentRadixSortOnesweep , current_bit(current_bit) 
, num_bits(num_bits) , warp(threadIdx.x / WARP_THREADS) - , lane(LaneId()) + , lane(::cuda::ptx::get_sreg_laneid()) , decomposer(decomposer) { // initialization @@ -677,10 +683,24 @@ struct AgentRadixSortOnesweep { s.block_idx = atomicAdd(d_ctrs, 1); } - CTA_SYNC(); + __syncthreads(); block_idx = s.block_idx; full_block = (block_idx + 1) * TILE_ITEMS <= num_items; } }; +} // namespace radix_sort +} // namespace detail + +template +using AgentRadixSortOnesweep CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = detail::radix_sort:: + AgentRadixSortOnesweep; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh index e91e32c5bd3..cc0c10464f3 100644 --- a/cub/cub/agent/agent_radix_sort_upsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh @@ -52,6 +52,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /****************************************************************************** @@ -98,6 +100,11 @@ struct AgentRadixSortUpsweepPolicy : ScalingType * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace radix_sort +{ + /** * @brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for * participating in device-wide radix sort upsweep . @@ -108,19 +115,19 @@ struct AgentRadixSortUpsweepPolicy : ScalingType * @tparam KeyT * KeyT type * - * @tparam DecomposerT = detail::identity_decomposer_t + * @tparam DecomposerT = identity_decomposer_t * Signed integer type for global offsets */ template + typename DecomposerT = identity_decomposer_t> struct AgentRadixSortUpsweep { //--------------------------------------------------------------------- // Type definitions and constants //--------------------------------------------------------------------- - using traits = detail::radix::traits_t; + using traits = radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; @@ -298,7 +305,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void UnpackDigitCounts() { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) @@ -331,7 +338,7 @@ struct AgentRadixSortUpsweep LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); // Prevent hoisting - CTA_SYNC(); + __syncthreads(); // Bucket tile of keys Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); @@ -385,12 +392,12 @@ struct AgentRadixSortUpsweep block_offset += TILE_ITEMS; } - CTA_SYNC(); + __syncthreads(); // Aggregate back into local_count registers to prevent overflow UnpackDigitCounts(); - CTA_SYNC(); + __syncthreads(); // Reset composite counters in lanes ResetDigitCounters(); @@ -406,7 +413,7 @@ struct AgentRadixSortUpsweep // Process partial tile if necessary ProcessPartialTile(block_offset, block_end); - CTA_SYNC(); + __syncthreads(); // Aggregate back into local_count registers UnpackDigitCounts(); @@ -419,7 +426,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT* counters, int bin_stride = 1, int bin_offset = 0) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); // Place 
unpacked digit counters in shared memory #pragma unroll @@ -440,7 +447,7 @@ struct AgentRadixSortUpsweep } } - CTA_SYNC(); + __syncthreads(); // Rake-reduce bin_count reductions @@ -499,7 +506,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); // Place unpacked digit counters in shared memory #pragma unroll @@ -520,7 +527,7 @@ struct AgentRadixSortUpsweep } } - CTA_SYNC(); + __syncthreads(); // Rake-reduce bin_count reductions #pragma unroll @@ -542,4 +549,15 @@ struct AgentRadixSortUpsweep } }; +} // namespace radix_sort +} // namespace detail + +template +using AgentRadixSortUpsweep CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::radix_sort::AgentRadixSortUpsweep; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index d5e3514f369..35779d0e8a6 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -95,6 +95,11 @@ struct AgentReducePolicy : ScalingType * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace reduce +{ + /** * @brief AgentReduce implements a stateful abstraction of CUDA thread blocks * for participating in device-wide reduction . @@ -136,7 +141,7 @@ struct AgentReduce //--------------------------------------------------------------------- /// The input value type - using InputT = cub::detail::value_t; + using InputT = value_t; /// Vector type of InputT for data movement using VectorT = typename CubVector::Type; @@ -249,8 +254,7 @@ struct AgentReduce AccumT items[ITEMS_PER_THREAD]; // Load items in striped fashion - cub::detail::load_transform_direct_striped( - threadIdx.x, d_wrapped_in + block_offset, items, transform_op); + load_transform_direct_striped(threadIdx.x, d_wrapped_in + block_offset, items, transform_op); // Reduce items within each thread stripe thread_aggregate = (IS_FIRST_TILE) ? 
cub::ThreadReduce(items, reduction_op) @@ -445,4 +449,18 @@ private: } }; +} // namespace reduce +} // namespace detail + +template +using AgentReduce CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public interface " + "will be removed.") = detail::reduce:: + AgentReduce; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index 735993723d8..ac0d9045ab9 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -116,6 +116,11 @@ struct AgentReduceByKeyPolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace reduce +{ + /** * @brief AgentReduceByKey implements a stateful abstraction of CUDA thread * blocks for participating in device-wide reduce-value-by-key @@ -167,13 +172,13 @@ struct AgentReduceByKey //--------------------------------------------------------------------- // The input keys type - using KeyInputT = cub::detail::value_t; + using KeyInputT = value_t; // The output keys type - using KeyOutputT = cub::detail::non_void_value_t; + using KeyOutputT = non_void_value_t; // The input values type - using ValueInputT = cub::detail::value_t; + using ValueInputT = value_t; // Tuple type for scanning (pairs accumulated segment-value with // segment-index) @@ -426,7 +431,7 @@ struct AgentReduceByKey OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) { - CTA_SYNC(); + __syncthreads(); // Compact and scatter pairs #pragma unroll @@ -438,7 +443,7 @@ struct AgentReduceByKey } } - CTA_SYNC(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) { @@ -539,7 +544,7 @@ struct AgentReduceByKey tile_predecessor = (tile_idx == 0) ? keys[0] : d_keys_in[tile_offset - 1]; } - CTA_SYNC(); + __syncthreads(); // Load values if (IS_LAST_TILE) @@ -551,7 +556,7 @@ struct AgentReduceByKey BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); // Initialize head-flags and shuffle up the previous keys if (IS_LAST_TILE) @@ -694,4 +699,31 @@ struct AgentReduceByKey } }; +} // namespace reduce +} // namespace detail + +template +using AgentReduceByKey CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::reduce::AgentReduceByKey< + AgentReduceByKeyPolicyT, + KeysInputIteratorT, + UniqueOutputIteratorT, + ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT, + AccumT>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 2495d2f5f7a..f8898fa4281 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -133,6 +134,11 @@ struct AgentRlePolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace rle +{ + /** * @brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide * run-length-encode @@ -465,7 +471,7 @@ struct AgentRle { // Perform warpscans unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); LengthOffsetPair identity; identity.key = 0; @@ -501,7 +507,7 @@ struct AgentRle temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[warp_id] = thread_inclusive; } - CTA_SYNC(); + __syncthreads(); // Accumulate total selected and the warp-wide prefix @@ -531,7 +537,7 @@ struct AgentRle // Ensure all threads have read warp aggregates before temp_storage is repurposed in the // subsequent scatter stage - CTA_SYNC(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -551,7 +557,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); // Locally compact items within the warp (first warp) if (warp_id == 0) @@ -564,7 +570,7 @@ struct AgentRle #pragma unroll for (int SLICE = 1; SLICE < WARPS; ++SLICE) { - CTA_SYNC(); + __syncthreads(); if (warp_id == SLICE) { @@ -608,7 +614,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); // Unzip OffsetT run_offsets[ITEMS_PER_THREAD]; @@ -624,7 +630,7 @@ struct AgentRle WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]) .ScatterToStriped(run_offsets, thread_num_runs_exclusive_in_warp); - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]) .ScatterToStriped(run_lengths, thread_num_runs_exclusive_in_warp); @@ -762,7 +768,7 @@ struct AgentRle if (SYNC_AFTER_LOAD) { - CTA_SYNC(); + __syncthreads(); } // Set flags @@ -848,7 +854,7 @@ struct AgentRle if (SYNC_AFTER_LOAD) { - CTA_SYNC(); + __syncthreads(); } // Set flags @@ -878,7 +884,7 @@ struct AgentRle } } - CTA_SYNC(); + __syncthreads(); LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; @@ -989,4 +995,17 @@ struct AgentRle } }; +} // namespace rle +} // namespace detail + +template +using AgentRle CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public interface " + "will be removed.") = detail::rle:: + AgentRle; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_scan.cuh b/cub/cub/agent/agent_scan.cuh index 7021531d0cc..c3cc02b69a1 100644 --- a/cub/cub/agent/agent_scan.cuh +++ b/cub/cub/agent/agent_scan.cuh @@ -112,6 +112,11 @@ struct AgentScanPolicy : ScalingType * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace scan +{ + /** * @brief AgentScan implements a stateful abstraction of CUDA thread blocks for * participating in device-wide prefix scan. 
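The hunks above and below repeat one pattern per agent header: the agent type moves into a detail::<algorithm> namespace, and the old public name survives only as a deprecated alias. A minimal sketch of that pattern follows, with an abbreviated template parameter list purely for illustration (the real aliases in this diff forward every policy/iterator/offset/accumulator parameter), assuming CCCL's CCCL_DEPRECATED_BECAUSE macro:

// Sketch only: parameter lists are abbreviated relative to the actual hunks.
namespace detail
{
namespace scan
{
template <typename PolicyT, typename InputIteratorT, typename OutputIteratorT>
struct AgentScan
{
  // implementation now lives under the detail::scan namespace
};
} // namespace scan
} // namespace detail

// The old public name is kept as a deprecated alias so downstream code still
// builds, but with a warning steering users away from the internal type.
template <typename PolicyT, typename InputIteratorT, typename OutputIteratorT>
using AgentScan CCCL_DEPRECATED_BECAUSE(
  "This class is considered an implementation detail and the public interface will be removed.") =
  detail::scan::AgentScan<PolicyT, InputIteratorT, OutputIteratorT>;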
@@ -376,7 +381,7 @@ struct AgentScan BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } - CTA_SYNC(); + __syncthreads(); // Perform tile scan if (tile_idx == 0) @@ -397,7 +402,7 @@ struct AgentScan ScanTile(items, scan_op, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); // Store items if (IS_LAST_TILE) @@ -482,7 +487,7 @@ struct AgentScan BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } - CTA_SYNC(); + __syncthreads(); // Block scan if (IS_FIRST_TILE) @@ -496,7 +501,7 @@ struct AgentScan ScanTile(items, scan_op, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); // Store items if (IS_LAST_TILE) @@ -582,4 +587,19 @@ struct AgentScan } }; +} // namespace scan +} // namespace detail + +template +using AgentScan CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public interface " + "will be removed.") = detail::scan:: + AgentScan; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_scan_by_key.cuh b/cub/cub/agent/agent_scan_by_key.cuh index 6e79ca18d8c..722a44ac074 100644 --- a/cub/cub/agent/agent_scan_by_key.cuh +++ b/cub/cub/agent/agent_scan_by_key.cuh @@ -94,6 +94,11 @@ struct AgentScanByKeyPolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace scan_by_key +{ + /** * @brief AgentScanByKey implements a stateful abstraction of CUDA thread * blocks for participating in device-wide prefix scan by key. @@ -140,10 +145,10 @@ struct AgentScanByKey // Types and constants //--------------------------------------------------------------------- - using KeyT = cub::detail::value_t; - using InputT = cub::detail::value_t; + using KeyT = value_t; + using InputT = value_t; using FlagValuePairT = KeyValuePair; - using ReduceBySegmentOpT = detail::ScanBySegmentOp; + using ReduceBySegmentOpT = ScanBySegmentOp; using ScanTileStateT = ReduceByKeyScanTileState; @@ -333,7 +338,7 @@ struct AgentScanByKey BlockLoadKeysT(storage.load_keys).Load(d_keys_in + tile_base, keys); } - CTA_SYNC(); + __syncthreads(); if (IS_LAST_TILE) { @@ -347,7 +352,7 @@ struct AgentScanByKey BlockLoadValuesT(storage.load_values).Load(d_values_in + tile_base, values); } - CTA_SYNC(); + __syncthreads(); // first tile if (tile_idx == 0) @@ -386,7 +391,7 @@ struct AgentScanByKey ScanTile(scan_items, tile_aggregate, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); UnzipValues(values, scan_items); @@ -460,4 +465,29 @@ struct AgentScanByKey } }; +} // namespace scan_by_key +} // namespace detail + +template +using AgentScanByKey CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::scan_by_key::AgentScanByKey< + AgentScanByKeyPolicyT, + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOp, + ScanOpT, + InitValueT, + OffsetT, + AccumT>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_segment_fixup.cuh b/cub/cub/agent/agent_segment_fixup.cuh index 1cf5eff5008..caabf774ba8 100644 --- a/cub/cub/agent/agent_segment_fixup.cuh +++ b/cub/cub/agent/agent_segment_fixup.cuh @@ -110,6 +110,11 @@ struct AgentSegmentFixupPolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace segment_fixup +{ + /** * @brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for * participating in device-wide reduce-value-by-key @@ -145,7 +150,7 @@ struct 
AgentSegmentFixup //--------------------------------------------------------------------- // Data type of key-value input iterator - using KeyValuePairT = cub::detail::value_t; + using KeyValuePairT = value_t; // Value type using ValueT = typename KeyValuePairT::Value; @@ -376,7 +381,7 @@ struct AgentSegmentFixup BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); } - CTA_SYNC(); + __syncthreads(); KeyValuePairT tile_aggregate; if (tile_idx == 0) @@ -468,4 +473,23 @@ struct AgentSegmentFixup } }; +} // namespace segment_fixup +} // namespace detail + +template +using AgentSegmentFixup CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::segment_fixup::AgentSegmentFixup< + AgentSegmentFixupPolicyT, + PairsInputIteratorT, + AggregatesOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_segmented_radix_sort.cuh b/cub/cub/agent/agent_segmented_radix_sort.cuh index fe687fa9f51..e8921aaf045 100644 --- a/cub/cub/agent/agent_segmented_radix_sort.cuh +++ b/cub/cub/agent/agent_segmented_radix_sort.cuh @@ -45,6 +45,11 @@ CUB_NAMESPACE_BEGIN +namespace detail +{ +namespace radix_sort +{ + /** * This agent will be implementing the `DeviceSegmentedRadixSort` when the * https://github.com/NVIDIA/cub/issues/383 is addressed. @@ -69,7 +74,7 @@ template + typename DecomposerT = identity_decomposer_t> struct AgentSegmentedRadixSort { OffsetT num_items; @@ -80,7 +85,7 @@ struct AgentSegmentedRadixSort static constexpr int RADIX_DIGITS = 1 << RADIX_BITS; static constexpr int KEYS_ONLY = std::is_same::value; - using traits = detail::radix::traits_t; + using traits = radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; // Huge segment handlers @@ -154,13 +159,13 @@ struct AgentSegmentedRadixSort { BlockValueLoadT(temp_storage.values_load).Load(d_values_in, thread_values, num_items); - CTA_SYNC(); + __syncthreads(); } { BlockKeyLoadT(temp_storage.keys_load).Load(d_keys_in, thread_keys, num_items, oob_default); - CTA_SYNC(); + __syncthreads(); } BlockRadixSortT(temp_storage.sort) @@ -187,13 +192,13 @@ struct AgentSegmentedRadixSort BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits, decomposer); upsweep.ProcessRegion(OffsetT{}, num_items); - CTA_SYNC(); + __syncthreads(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); - CTA_SYNC(); + __syncthreads(); if (IS_DESCENDING) { @@ -209,7 +214,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -243,7 +248,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -257,7 +262,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); // Downsweep BlockDownsweepT downsweep( @@ -275,4 +280,18 @@ struct AgentSegmentedRadixSort } }; +} // namespace radix_sort +} // namespace detail + +template +using AgentSegmentedRadixSort CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::radix_sort::AgentSegmentedRadixSort; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 4f16992b276..37e7b838adf 100644 
--- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -123,6 +123,9 @@ struct AgentSelectIfPolicy namespace detail { +namespace select +{ + template struct partition_distinct_output_t { @@ -132,7 +135,6 @@ struct partition_distinct_output_t selected_iterator_t selected_it; rejected_iterator_t rejected_it; }; -} // namespace detail /** * @brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in @@ -210,13 +212,13 @@ struct AgentSelectIf // If we need to enforce memory order for in-place stream compaction, wrap the default decoupled look-back tile // state in a helper class that enforces memory order on reads and writes - using MemoryOrderedTileStateT = detail::tile_state_with_memory_order; + using MemoryOrderedTileStateT = tile_state_with_memory_order; // The input value type - using InputT = cub::detail::value_t; + using InputT = value_t; // The flag value type - using FlagT = cub::detail::value_t; + using FlagT = value_t; // Constants enum @@ -408,7 +410,7 @@ struct AgentSelectIf OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { - CTA_SYNC(); + __syncthreads(); FlagT flags[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -450,7 +452,7 @@ struct AgentSelectIf OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { - CTA_SYNC(); + __syncthreads(); FlagT flags[ITEMS_PER_THREAD]; @@ -486,7 +488,7 @@ struct AgentSelectIf { if (IS_FIRST_TILE && streaming_context.is_first_partition()) { - CTA_SYNC(); + __syncthreads(); // Set head selection_flags. First tile sets the first flag for the first item BlockDiscontinuityT(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); @@ -499,7 +501,7 @@ struct AgentSelectIf tile_predecessor = d_in[tile_offset + streaming_context.input_offset() - 1]; } - CTA_SYNC(); + __syncthreads(); BlockDiscontinuityT(temp_storage.scan_storage.discontinuity) .FlagHeads(selection_flags, items, inequality_op, tile_predecessor); @@ -571,7 +573,7 @@ struct AgentSelectIf int num_tile_selections, OffsetT num_selections_prefix) { - CTA_SYNC(); + __syncthreads(); // Compact and scatter items #pragma unroll @@ -584,7 +586,7 @@ struct AgentSelectIf } } - CTA_SYNC(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { @@ -667,7 +669,7 @@ struct AgentSelectIf OffsetT num_selections, Int2Type /*is_keep_rejects*/) { - CTA_SYNC(); + __syncthreads(); int tile_num_rejections = num_tile_items - num_tile_selections; @@ -685,7 +687,7 @@ struct AgentSelectIf } // Ensure all threads finished scattering to shared memory - CTA_SYNC(); + __syncthreads(); // Gather items from shared memory and scatter to global ScatterPartitionsToGlobal( @@ -702,7 +704,7 @@ struct AgentSelectIf int tile_num_rejections, OffsetT num_selections_prefix, OffsetT num_rejected_prefix, - detail::partition_distinct_output_t partitioned_out_wrapper) + partition_distinct_output_t partitioned_out_wrapper) { auto selected_out_it = partitioned_out_wrapper.selected_it + streaming_context.num_previously_selected(); auto rejected_out_it = partitioned_out_wrapper.rejected_it + streaming_context.num_previously_rejected(); @@ -814,7 +816,7 @@ struct AgentSelectIf // Ensure temporary storage used during block load can be reused // Also, in case of in-place stream compaction, this is needed to order the loads of // *all threads of this thread block* before the st.release of the thread writing this thread block's tile state - CTA_SYNC(); + 
__syncthreads(); // Exclusive scan of selection_flags OffsetT num_tile_selections; @@ -894,7 +896,7 @@ struct AgentSelectIf // Ensure temporary storage used during block load can be reused // Also, in case of in-place stream compaction, this is needed to order the loads of // *all threads of this thread block* before the st.release of the thread writing this thread block's tile state - CTA_SYNC(); + __syncthreads(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op( @@ -1014,4 +1016,36 @@ struct AgentSelectIf } }; +} // namespace select +} // namespace detail + +template +using partition_distinct_output_t CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the " + "public interface will be removed.") = + detail::select::partition_distinct_output_t; + +template +using AgentSelectIf CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::select::AgentSelectIf< + AgentSelectIfPolicyT, + InputIteratorT, + FlagsInputIteratorT, + OutputIteratorWrapperT, + SelectOpT, + EqualityOpT, + OffsetT, + StreamingContextT, + KeepRejects, + MayAlias>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh index 41c40bee28e..80d571d58db 100644 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ b/cub/cub/agent/agent_spmv_orig.cuh @@ -52,6 +52,8 @@ #include #include +#include +#include #include #include @@ -102,7 +104,7 @@ template -struct AgentSpmvPolicy +struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmvPolicy { enum { @@ -148,7 +150,12 @@ struct AgentSpmvPolicy * Signed integer type for sequence offsets */ template -struct SpmvParams +struct +// with NVHPC, we get a deprecation warning in the implementation of cudaLaunchKernelEx, which we cannot suppress :/ +#if !_CCCL_COMPILER(NVHPC) + CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") +#endif + SpmvParams { /// Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix /// A. @@ -211,7 +218,7 @@ template -struct AgentSpmv +struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv { //--------------------------------------------------------------------- // Types and constants @@ -308,7 +315,9 @@ struct AgentSpmv /// Reference to temp_storage _TempStorage& temp_storage; + _CCCL_SUPPRESS_DEPRECATED_PUSH SpmvParams& spmv_params; + _CCCL_SUPPRESS_DEPRECATED_POP /// Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements /// of matrix A. 
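For the SpMV agent the direction is different: instead of moving into detail, the types are deprecated outright in favor of cuSPARSE, and the spots where CUB's own headers must still name them are bracketed with suppression macros. A minimal sketch, assuming the CCCL_DEPRECATED_BECAUSE / _CCCL_SUPPRESS_DEPRECATED_PUSH / _CCCL_SUPPRESS_DEPRECATED_POP macros used in this diff (template parameters and members omitted for brevity):

struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") SpmvParams
{
  // pointers describing the CSR matrix and the x/y vectors
};

struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
{
  // Naming the deprecated parameter bundle inside CUB itself would emit the
  // warning while building CUB, so the member (and, below, the constructor)
  // is wrapped in push/pop suppression:
  _CCCL_SUPPRESS_DEPRECATED_PUSH
  SpmvParams& spmv_params;
  _CCCL_SUPPRESS_DEPRECATED_POP
};

As the diff notes, the SpmvParams attribute is additionally skipped under NVHPC, where a resulting deprecation warning inside cudaLaunchKernelEx cannot be suppressed.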
@@ -341,6 +350,7 @@ struct AgentSpmv * @param spmv_params * SpMV input parameter bundle */ + _CCCL_SUPPRESS_DEPRECATED_PUSH _CCCL_DEVICE _CCCL_FORCEINLINE AgentSpmv(TempStorage& temp_storage, SpmvParams& spmv_params) : temp_storage(temp_storage.Alias()) , spmv_params(spmv_params) @@ -350,6 +360,7 @@ struct AgentSpmv , wd_vector_x(spmv_params.d_vector_x) , wd_vector_y(spmv_params.d_vector_y) {} + _CCCL_SUPPRESS_DEPRECATED_POP /** * @brief Consume a merge tile, specialized for direct-load of nonzeros @@ -367,12 +378,12 @@ struct AgentSpmv // Gather the row end-offsets for the merge tile into shared memory for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { - const OffsetT offset = - (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); + const OffsetT offset = (::cuda::std::min)( + static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } - CTA_SYNC(); + __syncthreads(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); @@ -386,7 +397,7 @@ struct AgentSpmv tile_num_nonzeros, thread_start_coord); - CTA_SYNC(); // Perf-sync + __syncthreads(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; @@ -425,7 +436,7 @@ struct AgentSpmv } } - CTA_SYNC(); + __syncthreads(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; @@ -548,12 +559,12 @@ struct AgentSpmv #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { - const OffsetT offset = - (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); + const OffsetT offset = (::cuda::std::min)( + static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } - CTA_SYNC(); + __syncthreads(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); @@ -567,7 +578,7 @@ struct AgentSpmv tile_num_nonzeros, thread_start_coord); - CTA_SYNC(); // Perf-sync + __syncthreads(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; @@ -600,7 +611,7 @@ struct AgentSpmv scan_segment[ITEM].key = thread_current_coord.x; } - CTA_SYNC(); + __syncthreads(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; @@ -620,7 +631,7 @@ struct AgentSpmv if (tile_num_rows > 0) { - CTA_SYNC(); + __syncthreads(); // Scan downsweep and scatter ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; @@ -647,7 +658,7 @@ struct AgentSpmv } } - CTA_SYNC(); + __syncthreads(); #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) @@ -709,7 +720,7 @@ struct AgentSpmv } } - CTA_SYNC(); + __syncthreads(); CoordinateT tile_start_coord = temp_storage.tile_coords[0]; CoordinateT tile_end_coord = temp_storage.tile_coords[1]; diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh index f07b2173cdc..b10f1cda3ea 100644 --- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh @@ -77,6 +77,11 @@ struct AgentSmallAndMediumSegmentedSortPolicy static constexpr int SEGMENTS_PER_SMALL_BLOCK = BLOCK_THREADS / SmallPolicyT::WARP_THREADS; }; +namespace 
detail +{ +namespace sub_warp_merge_sort +{ + /** * @brief AgentSubWarpSort implements a sub-warp merge sort. * @@ -168,8 +173,8 @@ class AgentSubWarpSort // LOWEST -> -nan = 11...11b -> TwiddleIn -> 0 = 00...00b // Segmented sort doesn't support custom types at the moment. - bit_ordered_type default_key_bits = IS_DESCENDING ? traits::min_raw_binary_key(detail::identity_decomposer_t{}) - : traits::max_raw_binary_key(detail::identity_decomposer_t{}); + bit_ordered_type default_key_bits = IS_DESCENDING ? traits::min_raw_binary_key(identity_decomposer_t{}) + : traits::max_raw_binary_key(identity_decomposer_t{}); return reinterpret_cast(default_key_bits); } @@ -233,23 +238,23 @@ public: KeyT oob_default = AgentSubWarpSort::get_oob_default(Int2Type::value>{}); WarpLoadKeysT(storage.load_keys).Load(keys_input, keys, segment_size, oob_default); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); if (!KEYS_ONLY) { WarpLoadItemsT(storage.load_items).Load(values_input, values, segment_size); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); } warp_merge_sort.Sort(keys, values, BinaryOpT{}, segment_size, oob_default); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); WarpStoreKeysT(storage.store_keys).Store(keys_output, keys, segment_size); if (!KEYS_ONLY) { - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); WarpStoreItemsT(storage.store_items).Store(values_output, values, segment_size); } } @@ -331,4 +336,12 @@ private: } }; +} // namespace sub_warp_merge_sort +} // namespace detail + +template +using AgentSubWarpSort CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::sub_warp_merge_sort::AgentSubWarpSort; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh index eec24057163..047861254ac 100644 --- a/cub/cub/agent/agent_three_way_partition.cuh +++ b/cub/cub/agent/agent_three_way_partition.cuh @@ -56,6 +56,26 @@ CUB_NAMESPACE_BEGIN * Tuning policy types ******************************************************************************/ +template > +struct AgentThreeWayPartitionPolicy +{ + static constexpr int BLOCK_THREADS = _BLOCK_THREADS; + static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + + struct detail + { + using delay_constructor_t = DelayConstructorT; + }; +}; + namespace detail { @@ -135,30 +155,6 @@ struct accumulator_pack_t : accumulator_pack_base_t } }; -} // namespace three_way_partition - -} // namespace detail - -template > -struct AgentThreeWayPartitionPolicy -{ - static constexpr int BLOCK_THREADS = _BLOCK_THREADS; - static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; - - struct detail - { - using delay_constructor_t = DelayConstructorT; - }; -}; - /** * \brief Implements a device-wide three-way partitioning * @@ -184,9 +180,9 @@ struct AgentThreeWayPartition 
//--------------------------------------------------------------------- // The input value type - using InputT = cub::detail::value_t; + using InputT = value_t; - using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; + using AccumPackHelperT = accumulator_pack_t; using AccumPackT = typename AccumPackHelperT::pack_t; // Tile status descriptor interface type @@ -313,7 +309,7 @@ struct AgentThreeWayPartition AccumPackT num_tile_selected_prefix, OffsetT num_rejected_prefix) { - CTA_SYNC(); + __syncthreads(); const OffsetT num_first_selections_prefix = AccumPackHelperT::first(num_tile_selected_prefix); const OffsetT num_second_selections_prefix = AccumPackHelperT::second(num_tile_selected_prefix); @@ -353,7 +349,7 @@ struct AgentThreeWayPartition } } - CTA_SYNC(); + __syncthreads(); // Gather items from shared memory and scatter to global auto first_base = @@ -421,7 +417,7 @@ struct AgentThreeWayPartition // Initialize selection_flags Initialize(num_tile_items, items, items_selection_flags); - CTA_SYNC(); + __syncthreads(); // Exclusive scan of selection_flags BlockScanT(temp_storage.scan_storage.scan) @@ -486,7 +482,7 @@ struct AgentThreeWayPartition // Initialize selection_flags Initialize(num_tile_items, items, items_selected_flags); - CTA_SYNC(); + __syncthreads(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, ::cuda::std::plus<>{}, tile_idx); @@ -497,7 +493,7 @@ struct AgentThreeWayPartition AccumPackT num_items_in_tile_selected = prefix_op.GetBlockAggregate(); AccumPackT num_items_selected_prefix = prefix_op.GetExclusivePrefix(); - CTA_SYNC(); + __syncthreads(); OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - AccumPackHelperT::sum(num_items_selected_prefix); @@ -593,4 +589,7 @@ struct AgentThreeWayPartition } }; +} // namespace three_way_partition +} // namespace detail + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index 30f5d4f50e4..a1a731f150f 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -92,6 +92,11 @@ struct AgentUniqueByKeyPolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace unique_by_key +{ + /** * @brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating * in device-wide unique-by-key @@ -286,7 +291,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); // Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset // types @@ -296,7 +301,7 @@ struct AgentUniqueByKey items_out[num_selections_prefix + item] = GetShared(tag)[item]; } - CTA_SYNC(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -337,7 +342,7 @@ struct AgentUniqueByKey BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -352,7 +357,7 @@ struct AgentUniqueByKey BlockLoadValues(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, keys, inequality_op); #pragma unroll @@ -365,7 +370,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; @@ -390,7 
+395,7 @@ struct AgentUniqueByKey } num_selections = num_tile_selections; - CTA_SYNC(); + __syncthreads(); Scatter(KeyTagT(), d_keys_out, @@ -402,7 +407,7 @@ struct AgentUniqueByKey num_selections_prefix, num_selections); - CTA_SYNC(); + __syncthreads(); Scatter(ValueTagT(), d_values_out, @@ -454,7 +459,7 @@ struct AgentUniqueByKey BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -469,7 +474,7 @@ struct AgentUniqueByKey BlockLoadValues(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); KeyT tile_predecessor = d_keys_in[tile_offset - 1]; BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) @@ -485,7 +490,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; @@ -505,7 +510,7 @@ struct AgentUniqueByKey num_selections -= num_discount; } - CTA_SYNC(); + __syncthreads(); Scatter(KeyTagT(), d_keys_out, @@ -517,7 +522,7 @@ struct AgentUniqueByKey num_selections_prefix, num_selections); - CTA_SYNC(); + __syncthreads(); Scatter(ValueTagT(), d_values_out, @@ -606,4 +611,25 @@ struct AgentUniqueByKey } }; +} // namespace unique_by_key +} // namespace detail + +template +using AgentUniqueByKey CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::unique_by_key::AgentUniqueByKey< + AgentUniqueByKeyPolicyT, + KeyInputIteratorT, + ValueInputIteratorT, + KeyOutputIteratorT, + ValueOutputIteratorT, + EqualityOpT, + OffsetT>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index 71469a0055a..bd6551b8f8d 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -733,7 +733,7 @@ public: tile_descriptor = reinterpret_cast(alias); } - while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)) + while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID))) { delay_or_prevent_hoisting(); TxnWord alias = LoadStatus(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); @@ -918,7 +918,7 @@ struct ScanTileState delay(); status = detail::load_relaxed(d_tile_status + TILE_STATUS_PADDING + tile_idx); __threadfence(); - } while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff)); + } while (__any_sync(0xffffffff, (status == SCAN_TILE_INVALID))); if (status == StatusWord(SCAN_TILE_PARTIAL)) { @@ -1145,7 +1145,7 @@ struct ReduceByKeyScanTileState TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); tile_descriptor = reinterpret_cast(alias); - } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + } while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID))); status = tile_descriptor.status; value.value = tile_descriptor.value; @@ -1268,7 +1268,7 @@ struct TilePrefixCallbackOp exclusive_prefix = window_aggregate; // Keep sliding the window back until we come across a tile whose inclusive prefix is known - while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) + while (__all_sync(0xffffffff, (predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)))) { predecessor_idx -= CUB_PTX_WARP_THREADS; diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index 5bc3bae3219..38636571e80 
100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -309,7 +309,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) @@ -408,7 +408,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) @@ -499,7 +499,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { @@ -622,7 +622,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { @@ -736,7 +736,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD - 1; item++) @@ -837,7 +837,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) @@ -926,7 +926,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD < valid_items) { diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index fb88dfac07f..e4998f32510 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -292,7 +292,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if (linear_tid == 0) { @@ -337,7 +337,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread @@ -586,7 +586,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = @@ -686,7 +686,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
tile_successor_item : // Last thread @@ -790,7 +790,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -920,7 +920,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -1052,7 +1052,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -1189,7 +1189,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index bdc2a3dc932..d1ae91c223d 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -47,6 +47,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -179,7 +181,7 @@ private: // TODO(bgruber): can we use signed int here? Only these variables are unsigned: unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); - unsigned int lane_id = LaneId(); + unsigned int lane_id = ::cuda::ptx::get_sreg_laneid(); unsigned int warp_id = WARPS == 1 ? 0 : linear_tid / WARP_THREADS; unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS; @@ -215,7 +217,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -249,7 +251,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -265,7 +267,7 @@ private: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -322,7 +324,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -361,7 +363,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -378,7 +380,7 @@ private: #pragma unroll for (int slice = 1; slice < TIME_SLICES; ++slice) { - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -393,7 +395,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -434,7 +436,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); // No timeslicing #pragma unroll @@ -470,7 +472,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -493,7 +495,7 @@ private: } } - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -543,7 +545,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); 
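Most of the remaining churn in the block-level headers is the same mechanical substitution: CUB's synchronization and warp-primitive wrappers are replaced by the raw CUDA intrinsics and the libcu++ PTX helpers. The sketch below is illustrative only (the kernel and variable names are invented); note that the ballot/any/all/shuffle replacements take the mask first, flipping the old argument order, and that SHR_ADD becomes an explicit shift-and-add:

#include <cuda/ptx> // libcu++ header providing ::cuda::ptx::get_sreg_laneid()

__global__ void sync_replacement_example(int* out)
{
  const unsigned lane = ::cuda::ptx::get_sreg_laneid();        // was: LaneId()
  int value           = static_cast<int>(lane);

  __syncthreads();                                             // was: CTA_SYNC()
  const bool any_zero = __any_sync(0xffffffff, value == 0);    // was: WARP_ANY(value == 0, 0xffffffff)
  value               = __shfl_sync(0xffffffff, value, 0);     // was: SHFL_IDX_SYNC(value, 0, 0xffffffff)
  __syncwarp(0xffffffff);                                      // was: WARP_SYNC(0xffffffff)

  const int padded = (value >> 5) + value;                     // was: SHR_ADD(value, 5, value)

  // Similarly in these hunks: CTA_SYNC_AND/OR -> __syncthreads_and/_or,
  // WARP_BALLOT(p, mask) -> __ballot_sync(mask, p), (cub::min) -> (::cuda::std::min).
  out[threadIdx.x] = any_zero ? padded : value;
}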
#pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -572,7 +574,7 @@ private: #pragma unroll for (int slice = 0; slice < TIME_SLICES; ++slice) { - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -587,7 +589,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -626,12 +628,12 @@ private: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -639,7 +641,7 @@ private: int item_offset = linear_tid * ITEMS_PER_THREAD + i; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -667,7 +669,7 @@ private: #pragma unroll for (int slice = 0; slice < TIME_SLICES; slice++) { - CTA_SYNC(); + __syncthreads(); const int slice_offset = TIME_SLICED_ITEMS * slice; @@ -679,13 +681,13 @@ private: { _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -695,7 +697,7 @@ private: int item_offset = lane_id * ITEMS_PER_THREAD + i; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } temp_items[i] = temp_storage.buff[item_offset]; } @@ -733,12 +735,12 @@ private: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -746,7 +748,7 @@ private: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -777,7 +779,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -787,13 +789,13 @@ private: { _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1134,7 +1136,7 @@ public: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } if (ranks[i] >= 0) { @@ -1142,7 
+1144,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1150,7 +1152,7 @@ public: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -1193,7 +1195,7 @@ public: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } if (is_valid[i]) { @@ -1201,7 +1203,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1209,7 +1211,7 @@ public: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh index d5726f240f6..41abbd588b3 100644 --- a/cub/cub/block/block_histogram.cuh +++ b/cub/cub/block/block_histogram.cuh @@ -202,8 +202,8 @@ private: /// Internal specialization. using InternalBlockHistogram = ::cuda::std::_If, - BlockHistogramAtomic>; + detail::BlockHistogramSort, + detail::BlockHistogramAtomic>; /// Shared memory storage layout type for BlockHistogram using _TempStorage = typename InternalBlockHistogram::TempStorage; @@ -358,7 +358,7 @@ public: // Initialize histogram bin counts to zeros InitHistogram(histogram); - CTA_SYNC(); + __syncthreads(); // Composite the histogram InternalBlockHistogram(temp_storage).Composite(items, histogram); diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index b6d0c8a33b1..3ade5eb1609 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -43,6 +43,8 @@ #include #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -58,7 +60,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred) { OffsetT keys1_begin = diag < keys2_count ? 
0 : diag - keys2_count; - OffsetT keys1_end = (cub::min)(diag, keys1_count); + OffsetT keys1_end = (::cuda::std::min)(diag, keys1_count); while (keys1_begin < keys1_end) { @@ -425,12 +427,12 @@ public: int thread_idx_in_thread_group_being_merged = mask & linear_tid; - int diag = (cub::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged); + int diag = (::cuda::std::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged); - int keys1_beg = (cub::min)(valid_items, start); - int keys1_end = (cub::min)(valid_items, keys1_beg + size); + int keys1_beg = (::cuda::std::min)(valid_items, start); + int keys1_end = (::cuda::std::min)(valid_items, keys1_beg + size); int keys2_beg = keys1_end; - int keys2_end = (cub::min)(valid_items, keys2_beg + size); + int keys2_end = (::cuda::std::min)(valid_items, keys2_beg + size); int keys1_count = keys1_end - keys1_beg; int keys2_count = keys2_end - keys2_beg; @@ -760,7 +762,7 @@ public: private: _CCCL_DEVICE _CCCL_FORCEINLINE void SyncImplementation() const { - CTA_SYNC(); + __syncthreads(); } friend BlockMergeSortStrategyT; diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 92605b5168d..ad495e1db31 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -48,6 +48,7 @@ #include #include +#include #include #include #include @@ -477,12 +478,12 @@ public: *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; } - CTA_SYNC(); + __syncthreads(); // Scan shared memory counters ScanCounters(); - CTA_SYNC(); + __syncthreads(); // Extract the local ranks of each key #pragma unroll @@ -710,13 +711,13 @@ public: temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; } - CTA_SYNC(); + __syncthreads(); // Each warp will strip-mine its section of input, one strip at a time volatile DigitCounterT* digit_counters[KEYS_PER_THREAD]; uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; - uint32_t lane_mask_lt = LaneMaskLt(); + uint32_t lane_mask_lt = ::cuda::ptx::get_sreg_lanemask_lt(); #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) @@ -740,7 +741,7 @@ public: DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; // Warp-sync - WARP_SYNC(0xFFFFFFFF); + __syncwarp(0xFFFFFFFF); // Number of peers having same digit as me int32_t digit_count = __popc(peer_mask); @@ -755,13 +756,13 @@ public: } // Warp-sync - WARP_SYNC(0xFFFFFFFF); + __syncwarp(0xFFFFFFFF); // Number of prior keys having same digit ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); } - CTA_SYNC(); + __syncthreads(); // Scan warp counters @@ -781,7 +782,7 @@ public: temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; } - CTA_SYNC(); + __syncthreads(); if (!::cuda::std::is_same>::value) { CallBack(callback); @@ -977,7 +978,7 @@ struct BlockRadixRankMatchEarlyCounts match_masks[bin] = 0; } } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); // compute private per-part histograms int part = lane % NUM_PARTS; @@ -991,7 +992,7 @@ struct BlockRadixRankMatchEarlyCounts // no extra work is necessary if NUM_PARTS == 1 if (NUM_PARTS > 1) { - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); // TODO: handle RADIX_DIGITS % WARP_THREADS != 0 if it becomes necessary constexpr int WARP_BINS_PER_THREAD = RADIX_DIGITS / WARP_THREADS; int bins[WARP_BINS_PER_THREAD]; @@ -1001,7 +1002,7 @@ struct BlockRadixRankMatchEarlyCounts int bin = lane + u * WARP_THREADS; bins[u] = cub::ThreadReduce(warp_histograms[bin], ::cuda::std::plus<>{}); } - CTA_SYNC(); + 
__syncthreads(); // store the resulting histogram in shared memory int* warp_offsets = &s.warp_offsets[warp][0]; @@ -1066,22 +1067,22 @@ struct BlockRadixRankMatchEarlyCounts ::cuda::std::uint32_t bin = Digit(keys[u]); int* p_match_mask = &match_masks[bin]; atomicOr(p_match_mask, lane_mask); - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); int bin_mask = *p_match_mask; int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; - int popc = __popc(bin_mask & LaneMaskLe()); + int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) { // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } - warp_offset = SHFL_IDX_SYNC(warp_offset, leader, WARP_MASK); + warp_offset = __shfl_sync(WARP_MASK, warp_offset, leader); if (lane == leader) { *p_match_mask = 0; } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); ranks[u] = warp_offset + popc - 1; } } @@ -1099,13 +1100,13 @@ struct BlockRadixRankMatchEarlyCounts detail::warp_in_block_matcher_t::match_any(bin, warp); int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; - int popc = __popc(bin_mask & LaneMaskLe()); + int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) { // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } - warp_offset = SHFL_IDX_SYNC(warp_offset, leader, WARP_MASK); + warp_offset = __shfl_sync(WARP_MASK, warp_offset, leader); ranks[u] = warp_offset + popc - 1; } } @@ -1117,7 +1118,7 @@ struct BlockRadixRankMatchEarlyCounts { ComputeHistogramsWarp(keys); - CTA_SYNC(); + __syncthreads(); int bins[BINS_PER_THREAD]; ComputeOffsetsWarpUpsweep(bins); callback(bins); @@ -1125,7 +1126,7 @@ struct BlockRadixRankMatchEarlyCounts BlockScan(s.prefix_tmp).ExclusiveSum(bins, exclusive_digit_prefix); ComputeOffsetsWarpDownsweep(exclusive_digit_prefix); - CTA_SYNC(); + __syncthreads(); ComputeRanksItem(keys, ranks, Int2Type()); } @@ -1135,7 +1136,7 @@ struct BlockRadixRankMatchEarlyCounts , digit_extractor(digit_extractor) , callback(callback) , warp(threadIdx.x / WARP_THREADS) - , lane(LaneId()) + , lane(::cuda::ptx::get_sreg_laneid()) {} }; diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 3223b920b13..080053348d7 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -364,7 +364,7 @@ private: Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { - CTA_SYNC(); + __syncthreads(); // Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); @@ -377,7 +377,7 @@ private: Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { - CTA_SYNC(); + __syncthreads(); // Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); @@ -443,7 +443,7 @@ private: RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; - CTA_SYNC(); + __syncthreads(); // Exchange keys through shared memory in blocked arrangement BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); @@ -457,7 +457,7 @@ private: break; } - CTA_SYNC(); + __syncthreads(); } // Untwiddle bits if necessary @@ -522,7 +522,7 @@ public: RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; - CTA_SYNC(); + __syncthreads(); // Check if this is the last pass if (begin_bit >= end_bit) @@ -543,7 +543,7 @@ public: // Exchange values through 
shared memory in blocked arrangement ExchangeValues(values, ranks, is_keys_only, Int2Type()); - CTA_SYNC(); + __syncthreads(); } // Untwiddle bits if necessary diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh index 6828f6fa706..6cf578963fc 100644 --- a/cub/cub/block/block_reduce.cuh +++ b/cub/cub/block/block_reduce.cuh @@ -250,9 +250,9 @@ private: BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; - using WarpReductions = BlockReduceWarpReductions; - using RakingCommutativeOnly = BlockReduceRakingCommutativeOnly; - using Raking = BlockReduceRaking; + using WarpReductions = detail::BlockReduceWarpReductions; + using RakingCommutativeOnly = detail::BlockReduceRakingCommutativeOnly; + using Raking = detail::BlockReduceRaking; /// Internal specialization type using InternalBlockReduce = diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 0dca0a5d838..467d9141dc3 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -44,6 +44,9 @@ #include #include +#include +#include + #include #include @@ -284,7 +287,7 @@ private: for (int i = 0; i <= Log2::VALUE; i++) { OffsetT mid = cub::MidPoint(lower_bound, upper_bound); - mid = (cub::min)(mid, num_items - 1); + mid = (::cuda::std::min)(mid, num_items - 1); if (val < input[mid]) { @@ -314,7 +317,7 @@ private: } // Ensure run offsets and run values have been written to shared memory - CTA_SYNC(); + __syncthreads(); } template @@ -335,7 +338,7 @@ private: total_decoded_size = static_cast(decoded_size_aggregate); // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation) - CTA_SYNC(); + __syncthreads(); InitWithRunOffsets(run_values, run_offsets); } diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index c49eb36a52e..c25bd2d258d 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -250,9 +250,9 @@ private: ? BLOCK_SCAN_RAKING : ALGORITHM; - using WarpScans = BlockScanWarpScans; + using WarpScans = detail::BlockScanWarpScans; using Raking = - BlockScanRaking; + detail::BlockScanRaking; /// Define the delegate type for the desired algorithm using InternalBlockScan = ::cuda::std::_If; @@ -477,7 +477,7 @@ public: //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -714,17 +714,17 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide exclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage.scan).ExclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. @@ -957,7 +957,7 @@ public: //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! 
__syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -1230,16 +1230,16 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage.scan).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. @@ -1618,7 +1618,7 @@ public: //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage).InclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -1874,16 +1874,16 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage.scan).IncluisveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. @@ -2123,7 +2123,7 @@ public: //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage).InclusiveScan( //! thread_data, thread_data, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -2516,16 +2516,16 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage.scan).InclusiveScan( //! thread_data, thread_data, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. 
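For reference, a minimal self-contained sketch of the pattern the updated BlockScan documentation snippets above now show: cub::BlockLoad / BlockScan / BlockStore sharing aliased shared memory, separated by the plain CUDA barrier __syncthreads() that this patch substitutes for the internal CTA_SYNC() macro. The kernel name, tile size, and buffer name below are illustrative assumptions, not part of the patch.

#include <cub/block/block_load.cuh>
#include <cub/block/block_scan.cuh>
#include <cub/block/block_store.cuh>

// Illustrative sketch only (not part of this patch): one tile of
// 128 * 4 ints is loaded, exclusive-prefix-summed, and stored by a
// single thread block.
__global__ void block_scan_example(int* d_data)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;

  using BlockLoad  = cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>;
  using BlockScan  = cub::BlockScan<int, BLOCK_THREADS>;
  using BlockStore = cub::BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD>;

  // The collectives alias one shared-memory allocation, so a block-wide
  // barrier is required between them -- this is what the __syncthreads()
  // calls in the documentation snippets above guard.
  __shared__ union TempStorage
  {
    typename BlockLoad::TempStorage  load;
    typename BlockScan::TempStorage  scan;
    typename BlockStore::TempStorage store;
  } temp_storage;

  int thread_data[ITEMS_PER_THREAD];
  BlockLoad(temp_storage.load).Load(d_data, thread_data);
  __syncthreads(); // load storage is reused by the scan below

  BlockScan(temp_storage.scan).ExclusiveSum(thread_data, thread_data);
  __syncthreads(); // scan storage is reused by the store below

  BlockStore(temp_storage.store).Store(d_data, thread_data);
}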
diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index a3dedcc3c70..93d8715c63b 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -164,7 +164,7 @@ public: { temp_storage[linear_tid] = input; - CTA_SYNC(); + __syncthreads(); const int offset_tid = static_cast(linear_tid) + distance; if ((offset_tid >= 0) && (offset_tid < BLOCK_THREADS)) @@ -196,7 +196,7 @@ public: { temp_storage[linear_tid] = input; - CTA_SYNC(); + __syncthreads(); unsigned int offset = linear_tid + distance; if (offset >= BLOCK_THREADS) @@ -230,7 +230,7 @@ public: { temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) @@ -298,7 +298,7 @@ public: { temp_storage[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index 443f7a7f93b..e207a1d76c1 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -897,7 +897,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; @@ -980,7 +980,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; @@ -1063,7 +1063,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh index d4fdd9c405f..35bdfe8ee02 100644 --- a/cub/cub/block/radix_rank_sort_operations.cuh +++ b/cub/cub/block/radix_rank_sort_operations.cuh @@ -49,6 +49,8 @@ #include +#include +#include #include #include #include @@ -437,7 +439,7 @@ struct digit_f using traits = traits_t::type>; using bit_ordered_type = typename traits::bit_ordered_type; - const ::cuda::std::uint32_t bits_to_copy = min(src_size - src_bit_start, num_bits); + const ::cuda::std::uint32_t bits_to_copy = (::cuda::std::min)(src_size - src_bit_start, num_bits); if (bits_to_copy) { diff --git a/cub/cub/block/specializations/block_histogram_atomic.cuh b/cub/cub/block/specializations/block_histogram_atomic.cuh index 8edc8575c40..4103641dbe2 100644 --- a/cub/cub/block/specializations/block_histogram_atomic.cuh +++ b/cub/cub/block/specializations/block_histogram_atomic.cuh @@ -45,7 +45,8 @@ #endif // no system header CUB_NAMESPACE_BEGIN - +namespace detail +{ /** * @brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide * histograms from data samples partitioned across a CUDA thread block. 
@@ -72,7 +73,7 @@ struct BlockHistogramAtomic template _CCCL_DEVICE _CCCL_FORCEINLINE void Composite(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS]) { -// Update histogram + // Update histogram #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; ++i) { @@ -80,5 +81,11 @@ struct BlockHistogramAtomic } } }; +} // namespace detail + +template +using BlockHistogramAtomic CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockHistogramAtomic; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_histogram_sort.cuh b/cub/cub/block/specializations/block_histogram_sort.cuh index 7ef3c1264a5..127f30953b2 100644 --- a/cub/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/cub/block/specializations/block_histogram_sort.cuh @@ -49,7 +49,8 @@ #include CUB_NAMESPACE_BEGIN - +namespace detail +{ /** * @brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide * histograms from data samples partitioned across a CUDA thread block. @@ -187,7 +188,7 @@ struct BlockHistogramSort // Sort bytes in blocked arrangement BlockRadixSortT(temp_storage.sort).Sort(items); - CTA_SYNC(); + __syncthreads(); // Initialize the shared memory's run_begin and run_end for each bin int histo_offset = 0; @@ -205,7 +206,7 @@ struct BlockHistogramSort temp_storage.discontinuities.run_end[histo_offset + linear_tid] = TILE_SIZE; } - CTA_SYNC(); + __syncthreads(); int flags[ITEMS_PER_THREAD]; // unused @@ -219,7 +220,7 @@ struct BlockHistogramSort temp_storage.discontinuities.run_begin[items[0]] = 0; } - CTA_SYNC(); + __syncthreads(); // Composite into histogram histo_offset = 0; @@ -243,5 +244,18 @@ struct BlockHistogramSort } } }; +} // namespace detail + +template +using BlockHistogramSort CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = + detail::BlockHistogramSort; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 7c1db2c9050..90f8f12236f 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -50,7 +50,8 @@ #include CUB_NAMESPACE_BEGIN - +namespace detail +{ /** * @brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread * block. Supports non-commutative reduction operators. @@ -212,7 +213,7 @@ struct BlockReduceRaking // Place partial into shared memory grid. 
*BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) @@ -228,7 +229,7 @@ struct BlockReduceRaking // sync before re-using shmem (warp_storage/raking_grid are aliased) static_assert(RAKING_THREADS <= CUB_PTX_WARP_THREADS, "RAKING_THREADS must be <= warp size."); unsigned int mask = static_cast((1ull << RAKING_THREADS) - 1); - WARP_SYNC(mask); + __syncwarp(mask); partial = WarpReduce(temp_storage.warp_storage) .template Reduce<(IS_FULL_TILE && RAKING_UNGUARDED)>(partial, valid_raking_threads, reduction_op); @@ -257,5 +258,11 @@ struct BlockReduceRaking return Reduce(partial, num_valid, reduction_op); } }; +} // namespace detail + +template +using BlockReduceRaking CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockReduceRaking; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh index 49401e87fb4..7841db5f18a 100644 --- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -50,7 +50,8 @@ #include CUB_NAMESPACE_BEGIN - +namespace detail +{ /** * @brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction * across a CUDA thread block. Does not support non-commutative reduction operators. Does not @@ -83,7 +84,7 @@ struct BlockReduceRakingCommutativeOnly // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have // valid values - using FallBack = BlockReduceRaking; + using FallBack = detail::BlockReduceRaking; /// Constants enum @@ -167,7 +168,7 @@ struct BlockReduceRakingCommutativeOnly partial; } - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) @@ -214,7 +215,7 @@ struct BlockReduceRakingCommutativeOnly partial; } - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) @@ -231,5 +232,11 @@ struct BlockReduceRakingCommutativeOnly return partial; } }; +} // namespace detail + +template +using BlockReduceRakingCommutativeOnly CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockReduceRakingCommutativeOnly; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 4ee2b307bcf..2dfa526771f 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -48,8 +48,11 @@ #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN +namespace detail +{ /** * @brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction * across a CUDA thread block. Supports non-commutative reduction operators. @@ -121,7 +124,7 @@ struct BlockReduceWarpReductions : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS) - , lane_id(LaneId()) + , lane_id(::cuda::ptx::get_sreg_laneid()) {} /** @@ -184,7 +187,7 @@ struct BlockReduceWarpReductions detail::uninitialized_copy_single(temp_storage.warp_aggregates + warp_id, warp_aggregate); } - CTA_SYNC(); + __syncthreads(); // Update total aggregate in warp 0, lane 0 if (linear_tid == 0) @@ -254,5 +257,11 @@ struct BlockReduceWarpReductions return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } }; +} // namespace detail + +template +using BlockReduceWarpReductions CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockReduceWarpReductions; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_scan_raking.cuh b/cub/cub/block/specializations/block_scan_raking.cuh index f0fe7a5ca2a..2af4b8693fc 100644 --- a/cub/cub/block/specializations/block_scan_raking.cuh +++ b/cub/cub/block/specializations/block_scan_raking.cuh @@ -52,7 +52,8 @@ #include CUB_NAMESPACE_BEGIN - +namespace detail +{ /** * @brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA * thread block. @@ -302,7 +303,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -318,7 +319,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory exclusive_output = *placement_ptr; @@ -355,7 +356,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -371,7 +372,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, exclusive_partial); } - CTA_SYNC(); + __syncthreads(); // Grab exclusive partial from shared memory output = *placement_ptr; @@ -410,7 +411,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -433,7 +434,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -478,7 +479,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -501,7 +502,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab exclusive partial from shared memory output = *placement_ptr; @@ -559,7 +560,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -588,7 +589,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, downsweep_prefix); } - CTA_SYNC(); + __syncthreads(); // 
Grab thread prefix from shared memory output = *placement_ptr; @@ -626,7 +627,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -642,7 +643,7 @@ struct BlockScanRaking InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -680,7 +681,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -703,7 +704,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -758,7 +759,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -787,12 +788,18 @@ struct BlockScanRaking InclusiveDownsweep(scan_op, downsweep_prefix); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; } } }; +} // namespace detail + +template +using BlockScanRaking CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockScanRaking; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh index 851a71cbe7b..d034d2838ea 100644 --- a/cub/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh @@ -47,8 +47,11 @@ #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN +namespace detail +{ /** * @brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA * thread block. @@ -127,7 +130,7 @@ struct BlockScanWarpScans : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS) - , lane_id(LaneId()) + , lane_id(::cuda::ptx::get_sreg_laneid()) {} //--------------------------------------------------------------------- @@ -197,7 +200,7 @@ struct BlockScanWarpScans detail::uninitialized_copy_single(temp_storage.warp_aggregates + warp_id, warp_aggregate); } - CTA_SYNC(); + __syncthreads(); // Accumulate block aggregates and save the one that is our warp's prefix T warp_prefix; @@ -423,7 +426,7 @@ struct BlockScanWarpScans } } - CTA_SYNC(); + __syncthreads(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; @@ -528,12 +531,17 @@ struct BlockScanWarpScans } } - CTA_SYNC(); + __syncthreads(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; exclusive_output = scan_op(block_prefix, exclusive_output); } }; +} // namespace detail +template +using BlockScanWarpScans CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockScanWarpScans; CUB_NAMESPACE_END diff --git a/cub/cub/detail/device_synchronize.cuh b/cub/cub/detail/device_synchronize.cuh deleted file mode 100644 index 1d71c6ebc0d..00000000000 --- a/cub/cub/detail/device_synchronize.cuh +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2021 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include - -#include - -#include - -CUB_NAMESPACE_BEGIN - -namespace detail -{ - -/** - * Call `cudaDeviceSynchronize()` using the proper API for the current CUB and - * CUDA configuration. 
- */ -_CCCL_EXEC_CHECK_DISABLE -CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() -{ - cudaError_t result = cudaErrorNotSupported; - NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), ()); - return result; -} - -} // namespace detail - -CUB_NAMESPACE_END diff --git a/cub/cub/detail/temporary_storage.cuh b/cub/cub/detail/temporary_storage.cuh index cf5f98e775a..f271ce804a9 100644 --- a/cub/cub/detail/temporary_storage.cuh +++ b/cub/cub/detail/temporary_storage.cuh @@ -29,6 +29,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -96,7 +98,7 @@ public: private: _CCCL_HOST_DEVICE void set_bytes_required(std::size_t new_size) { - m_size = (max) (m_size, new_size); + m_size = (::cuda::std::max)(m_size, new_size); } _CCCL_HOST_DEVICE std::size_t get_bytes_required() const diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index 1af5f01f033..b910bb91a2b 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -43,8 +43,6 @@ #include #include -#include - #include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 1fb5656b82f..2347666289e 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -41,10 +41,13 @@ # pragma system_header #endif // no system header +#include #include #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -140,16 +143,19 @@ private: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; + + using OffsetT = + detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; @@ -286,8 +292,8 @@ public: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -313,16 +319,19 @@ private: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; + + using OffsetT = + detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; @@ -454,8 +463,8 @@ public: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -480,17 +489,18 @@ private: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, 
BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; - + using OffsetT = + detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_values; @@ -632,8 +642,8 @@ public: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -650,17 +660,18 @@ private: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; - + using OffsetT = + detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_values; @@ -803,8 +814,8 @@ public: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -931,8 +942,8 @@ public: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1067,8 +1078,8 @@ public: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1213,8 +1224,8 @@ public: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1350,8 +1361,8 @@ public: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1371,15 +1382,19 @@ private: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; - using DispatchT = DispatchSegmentedSort; + + using OffsetT = + detail::choose_signed_offset_t>; + using DispatchT = + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); @@ -1539,8 +1554,8 @@ public: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + 
::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1570,15 +1585,19 @@ private: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; - using DispatchT = DispatchSegmentedSort; + + using OffsetT = + detail::choose_signed_offset_t>; + using DispatchT = + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); @@ -1734,8 +1753,8 @@ public: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1763,15 +1782,19 @@ private: std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; - using DispatchT = DispatchSegmentedSort; + + using OffsetT = + detail::choose_signed_offset_t>; + using DispatchT = + DispatchSegmentedSort; return DispatchT::Dispatch( d_temp_storage, @@ -1931,8 +1954,8 @@ public: std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1958,15 +1981,19 @@ private: std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; - using DispatchT = DispatchSegmentedSort; + + using OffsetT = + detail::choose_signed_offset_t>; + using DispatchT = + DispatchSegmentedSort; return DispatchT::Dispatch( d_temp_storage, @@ -2125,8 +2152,8 @@ public: std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -2281,8 +2308,8 @@ public: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -2439,8 +2466,8 @@ public: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -2605,8 +2632,8 @@ public: std::size_t& temp_storage_bytes, 
DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -2768,8 +2795,8 @@ public: std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index 5a751181842..241af8cd1d1 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -78,7 +78,7 @@ CUB_NAMESPACE_BEGIN //! @cdp_class{DeviceSpmv} //! //! @endrst -struct DeviceSpmv +struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DeviceSpmv { //! @name CSR matrix operations //! @{ @@ -177,18 +177,19 @@ struct DeviceSpmv //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template - CUB_RUNTIME_FUNCTION static cudaError_t CsrMV( - void* d_temp_storage, - size_t& temp_storage_bytes, - const ValueT* d_values, - const int* d_row_offsets, - const int* d_column_indices, - const ValueT* d_vector_x, - ValueT* d_vector_y, - int num_rows, - int num_cols, - int num_nonzeros, - cudaStream_t stream = 0) + CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") + CUB_RUNTIME_FUNCTION static cudaError_t + CsrMV(void* d_temp_storage, + size_t& temp_storage_bytes, + const ValueT* d_values, + const int* d_row_offsets, + const int* d_column_indices, + const ValueT* d_vector_x, + ValueT* d_vector_y, + int num_rows, + int num_cols, + int num_nonzeros, + cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSpmv::CsrMV"); @@ -204,7 +205,9 @@ struct DeviceSpmv spmv_params.alpha = ValueT{1}; spmv_params.beta = ValueT{0}; + _CCCL_SUPPRESS_DEPRECATED_PUSH return DispatchSpmv::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); + _CCCL_SUPPRESS_DEPRECATED_POP } //! @} end member group diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index a8c733ef309..e717277e520 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -51,6 +51,9 @@ CUB_NAMESPACE_BEGIN +namespace detail::adjacent_difference +{ + template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile) @@ -78,7 +81,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel( // It is OK to introspect the return type or parameter types of the // `operator()` function of `__device__` extended lambda within device code. 
- using OutputT = detail::invoke_result_t; + using OutputT = invoke_result_t; using Agent = AgentDifference; + using AgentDifferenceInitT = + detail::adjacent_difference::AgentDifferenceInit; constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS; const int init_grid_size = ::cuda::ceil_div(num_tiles, init_block_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceAdjacentDifferenceInitKernel" "<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_block_size, reinterpret_cast(stream)); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream) - .doit(DeviceAdjacentDifferenceInitKernel, + .doit(detail::adjacent_difference:: + DeviceAdjacentDifferenceInitKernel, d_input, first_tile_previous, num_tiles, @@ -219,17 +226,17 @@ struct DispatchAdjacentDifference } } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel" "<<<%d, %d, 0, %lld>>>()\n", num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, reinterpret_cast(stream)); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) - .doit(DeviceAdjacentDifferenceDifferenceKernel< + .doit(detail::adjacent_difference::DeviceAdjacentDifferenceDifferenceKernel< typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 46ff7cbced6..c870221f3e1 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -54,6 +54,8 @@ #include +#include +#include #include #include @@ -62,6 +64,8 @@ CUB_NAMESPACE_BEGIN namespace detail { +namespace batch_memcpy +{ /** * Initialization kernel for tile status initialization (multi-block) */ @@ -100,15 +104,13 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO { using StatusWord = typename TileT::StatusWord; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT; - using BufferSizeT = cub::detail::value_t; + using BufferSizeT = value_t; /// Internal load/store type. 
For byte-wise memcpy, a single-byte type - using AliasT = - typename ::cuda::std::conditional, - std::iterator_traits>>::type::value_type; + using AliasT = typename ::cuda::std:: + conditional, std::iterator_traits>>::type::value_type; /// Types of the input and output buffers - using InputBufferT = cub::detail::value_t; - using OutputBufferT = cub::detail::value_t; + using InputBufferT = value_t; + using OutputBufferT = value_t; constexpr uint32_t BLOCK_THREADS = ActivePolicyT::BLOCK_THREADS; constexpr uint32_t ITEMS_PER_THREAD = ActivePolicyT::BYTES_PER_THREAD; @@ -131,7 +133,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO // Make sure thread 0 does not overwrite the buffer id before other threads have finished with // the prior iteration of the loop - CTA_SYNC(); + __syncthreads(); // Binary search the buffer that this tile belongs to if (threadIdx.x == 0) @@ -140,7 +142,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO } // Make sure thread 0 has written the buffer this thread block is assigned to - CTA_SYNC(); + __syncthreads(); const BufferOffsetT buffer_id = block_buffer_id; @@ -173,7 +175,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO copy_items( input_buffer_it[buffer_id], output_buffer_it[buffer_id], - (cub::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), + (::cuda::std::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), tile_offset_within_buffer); } @@ -229,7 +231,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLO BLevBlockOffsetTileState blev_block_scan_state) { // Internal type used for storing a buffer's size - using BufferSizeT = cub::detail::value_t; + using BufferSizeT = value_t; // Alias the correct tuning policy for the current compilation pass' architecture using AgentBatchMemcpyPolicyT = typename ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT; @@ -268,6 +270,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLO blev_block_scan_state) .ConsumeTile(blockIdx.x); } +} // namespace batch_memcpy /** * @tparam InputBufferIt **[inferred]** Random-access input iterator type providing the pointers @@ -462,8 +465,8 @@ struct DispatchBatchMemcpy // Kernels auto init_scan_states_kernel = - InitTileStateKernel; - auto batch_memcpy_non_blev_kernel = BatchMemcpyKernel< + detail::batch_memcpy::InitTileStateKernel; + auto batch_memcpy_non_blev_kernel = detail::batch_memcpy::BatchMemcpyKernel< typename PolicyHub::MaxPolicy, InputBufferIt, OutputBufferIt, @@ -478,7 +481,7 @@ struct DispatchBatchMemcpy BLevBlockOffsetTileState, IsMemcpy>; - auto multi_block_memcpy_kernel = MultiBlockBatchMemcpyKernel< + auto multi_block_memcpy_kernel = detail::batch_memcpy::MultiBlockBatchMemcpyKernel< typename PolicyHub::MaxPolicy, BufferOffsetT, BlevBufferSrcsOutItT, @@ -536,7 +539,7 @@ struct DispatchBatchMemcpy return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "InitTileStateKernel<<<%d, %d, 0, %lld>>>()\n", static_cast(init_grid_size), @@ -564,7 +567,7 @@ struct DispatchBatchMemcpy return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "BatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n", static_cast(batch_memcpy_grid_size), @@ -603,7 +606,7 @@ struct DispatchBatchMemcpy return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "MultiBlockBatchMemcpyKernel<<<%d, %d, 0, 
%lld>>>()\n", static_cast(batch_memcpy_blev_grid_size), diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh index 7ba478e3c00..895ac9821fb 100644 --- a/cub/cub/device/dispatch/dispatch_for.cuh +++ b/cub/cub/device/dispatch/dispatch_for.cuh @@ -51,10 +51,7 @@ CUB_NAMESPACE_BEGIN -namespace detail -{ - -namespace for_each +namespace detail::for_each { // The dispatch layer is in the detail namespace until we figure out tuning API @@ -101,7 +98,7 @@ struct dispatch_t const auto tile_size = static_cast(block_threads * items_per_thread); const auto num_tiles = ::cuda::ceil_div(num_items, tile_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread\n", static_cast(num_tiles), @@ -144,7 +141,7 @@ struct dispatch_t const auto tile_size = static_cast(block_threads * items_per_thread); const auto num_tiles = ::cuda::ceil_div(num_items, tile_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread\n", static_cast(num_tiles), @@ -195,8 +192,6 @@ struct dispatch_t } }; -} // namespace for_each - -} // namespace detail +} // namespace detail::for_each CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh index 6e346316d48..a4770a1b98c 100644 --- a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh +++ b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh @@ -73,9 +73,7 @@ CUB_NAMESPACE_BEGIN -namespace detail -{ -namespace for_each_in_extents +namespace detail::for_each_in_extents { // The dispatch layer is in the detail namespace until we figure out the tuning API @@ -117,7 +115,7 @@ public: constexpr unsigned items_per_thread = ActivePolicyT::for_policy_t::items_per_thread; unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread); -# ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +# ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each_in_extents::static_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n", num_cta, block_threads, @@ -155,7 +153,7 @@ public: _CUB_RETURN_IF_ERROR(status) unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread); -# ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +# ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each_in_extents::dynamic_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n", num_cta, block_threads, @@ -203,8 +201,7 @@ private: unsigned_index_type _size; }; -} // namespace for_each_in_extents -} // namespace detail +} // namespace detail::for_each_in_extents CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 900f758cdfb..2ac4e160220 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -1,4 +1,3 @@ - /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
@@ -98,6 +97,9 @@ CUB_NAMESPACE_BEGIN * @param tile_queue * Drain queue descriptor for dynamically mapping tile data onto thread blocks */ +namespace detail::histogram +{ + template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramInitKernel( ::cuda::std::array num_output_bins_wrapper, @@ -254,9 +256,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK agent.StoreOutput(); } -namespace detail -{ - template >>()\n", histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke histogram_init_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -453,7 +452,7 @@ struct dispatch_histogram } // Log histogram_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels " "per thread, %d SM occupancy\n", sweep_grid_dims.x, @@ -463,7 +462,7 @@ struct dispatch_histogram (long long) stream, pixels_per_thread, histogram_sweep_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke histogram_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream) @@ -503,20 +502,21 @@ struct dispatch_histogram CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { return Invoke( - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel); + detail::histogram::DeviceHistogramInitKernel, + detail::histogram::DeviceHistogramSweepKernel< + MaxPolicyT, + PRIVATIZED_SMEM_BINS, + NUM_CHANNELS, + NUM_ACTIVE_CHANNELS, + SampleIteratorT, + CounterT, + PrivatizedDecodeOpT, + OutputDecodeOpT, + OffsetT>); } }; -} // namespace detail +} // namespace detail::histogram /****************************************************************************** * Dispatch @@ -959,7 +959,7 @@ public: // Too many bins to keep in shared memory. 
constexpr int PRIVATIZED_SMEM_BINS = 0; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, @@ -995,7 +995,7 @@ public: // Dispatch shared-privatized approach constexpr int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, @@ -1129,7 +1129,7 @@ public: constexpr int PRIVATIZED_SMEM_BINS = 256; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, @@ -1277,7 +1277,7 @@ public: // Dispatch shared-privatized approach constexpr int PRIVATIZED_SMEM_BINS = 0; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, @@ -1313,7 +1313,7 @@ public: // Dispatch shared-privatized approach constexpr int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, @@ -1451,7 +1451,7 @@ public: constexpr int PRIVATIZED_SMEM_BINS = 256; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index ff43656c5c5..b3d0c8ab2ca 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -21,10 +21,11 @@ #include +#include +#include + CUB_NAMESPACE_BEGIN -namespace detail -{ -namespace merge +namespace detail::merge { _CCCL_INLINE_VAR constexpr int fallback_BLOCK_THREADS = 64; _CCCL_INLINE_VAR constexpr int fallback_ITEMS_PER_THREAD = 1; @@ -80,7 +81,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( const Offset partition_idx = blockDim.x * blockIdx.x + threadIdx.x; if (partition_idx < num_partitions) { - const Offset partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count); + const Offset partition_at = (::cuda::std::min)(partition_idx * items_per_tile, keys1_count + keys2_count); merge_partitions[partition_idx] = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op); } } @@ -300,6 +301,5 @@ struct dispatch_t return cudaSuccess; } }; -} // namespace merge -} // namespace detail +} // namespace detail::merge CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 1d455bdfbf1..e8cc91e8420 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -47,11 +47,13 @@ #include #include +#include +#include #include CUB_NAMESPACE_BEGIN -namespace detail +namespace detail::merge_sort { /** @@ -95,7 +97,7 @@ private: using block_sort_helper_t = dual_policy_agent_helper_t< DefaultPolicyT, fallback_policy_t, - AgentBlockSort, + merge_sort::AgentBlockSort, KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, @@ -112,7 +114,7 @@ private: using merge_helper_t = dual_policy_agent_helper_t< DefaultPolicyT, fallback_policy_t, - AgentMerge, + merge_sort::AgentMerge, KeyIteratorT, ValueIteratorT, OffsetT, @@ -125,9 +127,10 @@ private: // Use fallback if either (a) the default block sort or (b) the block merge agent exceed the maximum shared memory // available per block and both (1) the fallback block sort and (2) the fallback merge agent would 
not exceed the // available shared memory - static constexpr auto max_default_size = (cub::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); + static constexpr auto max_default_size = + (::cuda::std::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); static constexpr auto max_fallback_size = - (cub::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); + (::cuda::std::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); static constexpr bool uses_fallback_policy = (max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block); @@ -137,8 +140,6 @@ public: ::cuda::std::_If; using merge_agent_t = ::cuda::std::_If; }; -} // namespace detail - template __launch_bounds__( - cub::detail::merge_sort_vsmem_helper_t< - typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>::policy_t::BLOCK_THREADS) + merge_sort_vsmem_helper_t::policy_t::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel( bool ping, KeyInputIteratorT keys_in, @@ -169,9 +169,9 @@ __launch_bounds__( KeyT* tmp_keys_out, ValueT* tmp_items_out, CompareOpT compare_op, - cub::detail::vsmem_t vsmem) + vsmem_t vsmem) { - using MergeSortHelperT = cub::detail::merge_sort_vsmem_helper_t< + using MergeSortHelperT = merge_sort_vsmem_helper_t< typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, KeyInputIteratorT, ValueInputIteratorT, @@ -186,7 +186,7 @@ __launch_bounds__( using AgentBlockSortT = typename MergeSortHelperT::block_sort_agent_t; - using VSmemHelperT = cub::detail::vsmem_helper_impl; + using VSmemHelperT = vsmem_helper_impl; // Static shared memory allocation __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage; @@ -256,16 +256,15 @@ template __launch_bounds__( - cub::detail::merge_sort_vsmem_helper_t< - typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>::policy_t::BLOCK_THREADS) + merge_sort_vsmem_helper_t::policy_t::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel( bool ping, KeyIteratorT keys_ping, @@ -276,9 +275,9 @@ __launch_bounds__( CompareOpT compare_op, OffsetT* merge_partitions, OffsetT target_merged_tiles_number, - cub::detail::vsmem_t vsmem) + vsmem_t vsmem) { - using MergeSortHelperT = cub::detail::merge_sort_vsmem_helper_t< + using MergeSortHelperT = merge_sort_vsmem_helper_t< typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, KeyInputIteratorT, ValueInputIteratorT, @@ -293,7 +292,7 @@ __launch_bounds__( using AgentMergeT = typename MergeSortHelperT::merge_agent_t; - using VSmemHelperT = cub::detail::vsmem_helper_impl; + using VSmemHelperT = vsmem_helper_impl; // Static shared memory allocation __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage; @@ -323,6 +322,8 @@ __launch_bounds__( VSmemHelperT::discard_temp_storage(temp_storage); } +} // namespace detail::merge_sort + /******************************************************************************* * Policy ******************************************************************************/ @@ -405,7 +406,7 @@ struct DispatchMergeSort { using MergePolicyT = typename ActivePolicyT::MergeSortPolicy; - using merge_sort_helper_t = cub::detail::merge_sort_vsmem_helper_t< + using merge_sort_helper_t = 
detail::merge_sort::merge_sort_vsmem_helper_t< MergePolicyT, KeyInputIteratorT, ValueInputIteratorT, @@ -416,8 +417,8 @@ struct DispatchMergeSort KeyT, ValueT>; - using BlockSortVSmemHelperT = cub::detail::vsmem_helper_impl; - using MergeAgentVSmemHelperT = cub::detail::vsmem_helper_impl; + using BlockSortVSmemHelperT = detail::vsmem_helper_impl; + using MergeAgentVSmemHelperT = detail::vsmem_helper_impl; cudaError error = cudaSuccess; @@ -445,7 +446,7 @@ struct DispatchMergeSort */ const std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; const std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; - const std::size_t virtual_shared_memory_size = (cub::max)(block_sort_smem_size, merge_smem_size); + const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size); void* allocations[4] = {nullptr, nullptr, nullptr, nullptr}; std::size_t allocation_sizes[4] = { @@ -486,7 +487,7 @@ struct DispatchMergeSort THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( static_cast(num_tiles), merge_sort_helper_t::policy_t::BLOCK_THREADS, 0, stream, true) .doit( - DeviceMergeSortBlockSortKernel< + detail::merge_sort::DeviceMergeSortBlockSortKernel< typename PolicyHub::MaxPolicy, KeyInputIteratorT, ValueInputIteratorT, @@ -544,7 +545,7 @@ struct DispatchMergeSort // Partition THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( partition_grid_size, threads_per_partition_block, 0, stream, true) - .doit(DeviceMergeSortPartitionKernel, + .doit(detail::merge_sort::DeviceMergeSortPartitionKernel, ping, d_output_keys, keys_buffer, @@ -572,15 +573,16 @@ struct DispatchMergeSort THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( static_cast(num_tiles), static_cast(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true) .doit( - DeviceMergeSortMergeKernel, + detail::merge_sort::DeviceMergeSortMergeKernel< + typename PolicyHub::MaxPolicy, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + KeyT, + ValueT>, ping, d_output_keys, d_output_items, diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index 0d4d9bf1ea9..18bbd99d00d 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -77,6 +77,9 @@ CUB_NAMESPACE_BEGIN * Kernel entry points *****************************************************************************/ +namespace detail::radix_sort +{ + /** * @brief Upsweep digit-counting kernel entry point (multi-block). * Computes privatized digit histograms, one per block. @@ -149,7 +152,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp }; // Parameterize AgentRadixSortUpsweep type for the current configuration - using AgentRadixSortUpsweepT = AgentRadixSortUpsweep; + using AgentRadixSortUpsweepT = + detail::radix_sort::AgentRadixSortUpsweep; // Shared memory storage __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage; @@ -161,7 +165,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? 
int(ChainedPolicyT::ActivePolicy::AltUp upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); - CTA_SYNC(); + __syncthreads(); // Write out digit counts (striped) upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); @@ -190,13 +194,13 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), { // Parameterize the AgentScan type for the current configuration using AgentScanT = - AgentScan, - OffsetT, - OffsetT, - OffsetT>; + detail::scan::AgentScan, + OffsetT, + OffsetT, + OffsetT>; // Shared memory storage __shared__ typename AgentScanT::TempStorage temp_storage; @@ -308,7 +312,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo }; // Parameterize AgentRadixSortDownsweep type for the current configuration - using AgentRadixSortDownsweepT = + using AgentRadixSortDownsweepT = detail::radix_sort:: AgentRadixSortDownsweep; // Shared memory storage @@ -432,7 +436,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE // Load keys BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); - CTA_SYNC(); + __syncthreads(); // Load values if (!KEYS_ONLY) @@ -443,7 +447,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); - CTA_SYNC(); + __syncthreads(); } // Sort tile @@ -570,13 +574,14 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen }; // Upsweep type - using BlockUpsweepT = AgentRadixSortUpsweep; + using BlockUpsweepT = detail::radix_sort::AgentRadixSortUpsweep; // Digit-scan type using DigitScanT = BlockScan; // Downsweep type - using BlockDownsweepT = AgentRadixSortDownsweep; + using BlockDownsweepT = + detail::radix_sort::AgentRadixSortDownsweep; enum { @@ -616,13 +621,13 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits, decomposer); upsweep.ProcessRegion(segment_begin, segment_end); - CTA_SYNC(); + __syncthreads(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); - CTA_SYNC(); + __syncthreads(); if (IS_DESCENDING) { @@ -638,7 +643,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -677,7 +682,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -691,7 +696,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? 
ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); // Downsweep BlockDownsweepT downsweep( @@ -729,7 +734,8 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit, DecomposerT decomposer = {}) { using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy; - using AgentT = AgentRadixSortHistogram; + using AgentT = + detail::radix_sort::AgentRadixSortHistogram; __shared__ typename AgentT::TempStorage temp_storage; AgentT agent(temp_storage, d_bins_out, d_keys_in, num_items, start_bit, end_bit, decomposer); agent.Process(); @@ -759,7 +765,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy DecomposerT decomposer = {}) { using OnesweepPolicyT = typename ChainedPolicyT::ActivePolicy::OnesweepPolicy; - using AgentT = + using AgentT = detail::radix_sort:: AgentRadixSortOnesweep; __shared__ typename AgentT::TempStorage s; @@ -824,6 +830,8 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_b } } +} // namespace detail::radix_sort + /****************************************************************************** * Single-problem dispatch ******************************************************************************/ @@ -965,7 +973,7 @@ struct DispatchRadixSort } // Log single_tile_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit " "%d, bit_grain %d\n", 1, @@ -1036,7 +1044,7 @@ struct DispatchRadixSort int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); // Log upsweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, " "bit_grain %d\n", pass_config.even_share.grid_size, @@ -1078,7 +1086,7 @@ struct DispatchRadixSort } // Log scan_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", 1, pass_config.scan_config.block_threads, @@ -1105,7 +1113,7 @@ struct DispatchRadixSort } // Log downsweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, @@ -1295,7 +1303,8 @@ struct DispatchRadixSort constexpr int HISTO_BLOCK_THREADS = ActivePolicyT::HistogramPolicy::BLOCK_THREADS; int histo_blocks_per_sm = 1; - auto histogram_kernel = DeviceRadixSortHistogramKernel; + auto histogram_kernel = + detail::radix_sort::DeviceRadixSortHistogramKernel; error = CubDebug( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0)); @@ -1305,7 +1314,7 @@ struct DispatchRadixSort } // log histogram_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, " "%d SM occupancy, bit_grain %d\n", histo_blocks_per_sm * num_sms, @@ -1335,7 +1344,7 @@ struct DispatchRadixSort constexpr int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; // log exclusive_sum_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG 
_CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n", num_passes, SCAN_BLOCK_THREADS, @@ -1344,7 +1353,7 @@ struct DispatchRadixSort #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_passes, SCAN_BLOCK_THREADS, 0, stream) - .doit(DeviceRadixSortExclusiveSumKernel, d_bins); + .doit(detail::radix_sort::DeviceRadixSortExclusiveSumKernel, d_bins); error = CubDebug(error); if (cudaSuccess != error) { @@ -1383,7 +1392,7 @@ struct DispatchRadixSort } // log onesweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, " "current bit %d, bit_grain %d, portion %d/%d\n", num_blocks, @@ -1396,7 +1405,7 @@ struct DispatchRadixSort static_cast(num_portions)); #endif - auto onesweep_kernel = DeviceRadixSortOnesweepKernel< + auto onesweep_kernel = detail::radix_sort::DeviceRadixSortOnesweepKernel< max_policy_t, IS_DESCENDING, KeyT, @@ -1647,11 +1656,13 @@ struct DispatchRadixSort { // Invoke upsweep-downsweep return InvokePasses( - DeviceRadixSortUpsweepKernel, - DeviceRadixSortUpsweepKernel, - RadixSortScanBinsKernel, - DeviceRadixSortDownsweepKernel, - DeviceRadixSortDownsweepKernel); + detail::radix_sort::DeviceRadixSortUpsweepKernel, + detail::radix_sort::DeviceRadixSortUpsweepKernel, + detail::radix_sort::RadixSortScanBinsKernel, + detail::radix_sort:: + DeviceRadixSortDownsweepKernel, + detail::radix_sort:: + DeviceRadixSortDownsweepKernel); } template @@ -1672,7 +1683,7 @@ struct DispatchRadixSort } // Copy keys -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long) num_items, (long long) stream); #endif cudaError_t error = cudaSuccess; @@ -1694,7 +1705,7 @@ struct DispatchRadixSort // Copy values if necessary if (!KEYS_ONLY) { -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking async copy of %lld values on stream %lld\n", (long long) num_items, (long long) stream); #endif error = CubDebug(cudaMemcpyAsync( @@ -1751,7 +1762,8 @@ struct DispatchRadixSort { // Small, single tile size return InvokeSingleTile( - DeviceRadixSortSingleTileKernel); + detail::radix_sort:: + DeviceRadixSortSingleTileKernel); } else { @@ -2001,7 +2013,7 @@ struct DispatchSegmentedRadixSort int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); // Log kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), " "%lld items per thread, %lld SM occupancy, " "current bit %d, bit_grain %d\n", @@ -2223,7 +2235,7 @@ struct DispatchSegmentedRadixSort // Force kernel code-generation in all compiler passes return InvokePasses( - DeviceSegmentedRadixSortKernel< + detail::radix_sort::DeviceSegmentedRadixSortKernel< max_policy_t, false, IS_DESCENDING, @@ -2233,7 +2245,7 @@ struct DispatchSegmentedRadixSort EndOffsetIteratorT, OffsetT, DecomposerT>, - DeviceSegmentedRadixSortKernel< + detail::radix_sort::DeviceSegmentedRadixSortKernel< max_policy_t, true, IS_DESCENDING, diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 0cca1e1a982..fee10767875 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -66,6 +66,9 @@ _CCCL_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_BEGIN +namespace detail::reduce +{ + /// Normalize input iterator to segment offset template _CCCL_DEVICE 
_CCCL_FORCEINLINE void NormalizeReductionOutput(T& /*val*/, OffsetT /*base_offset*/, IteratorT /*itr*/) @@ -187,7 +190,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) if (threadIdx.x == 0) { - detail::reduce::finalize_and_store_aggregate(d_out + blockIdx.x, reduction_op, init, block_aggregate); + finalize_and_store_aggregate(d_out + blockIdx.x, reduction_op, init, block_aggregate); } } @@ -230,6 +233,7 @@ struct DeviceReduceKernelSource return sizeof(AccumT); } }; +} // namespace detail::reduce /****************************************************************************** * Single-problem dispatch @@ -263,7 +267,7 @@ template , InitT>, typename PolicyHub = detail::reduce::policy_hub, typename TransformOpT = ::cuda::std::__identity, - typename KernelSource = DeviceReduceKernelSource< + typename KernelSource = detail::reduce::DeviceReduceKernelSource< typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, @@ -378,13 +382,13 @@ struct DispatchReduce } // Log single_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", policy.SingleTile().BlockThreads(), (long long) stream, policy.SingleTile().ItemsPerThread()); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke single_reduce_sweep_kernel launcher_factory(1, policy.SingleTile().BlockThreads(), 0, stream) @@ -490,7 +494,7 @@ struct DispatchReduce int reduce_grid_size = even_share.grid_size; // Log device_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceKernel<<<%lu, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", (unsigned long) reduce_grid_size, @@ -498,7 +502,7 @@ struct DispatchReduce (long long) stream, active_policy.Reduce().ItemsPerThread(), reduce_config.sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke DeviceReduceKernel launcher_factory(reduce_grid_size, active_policy.Reduce().BlockThreads(), 0, stream) @@ -519,13 +523,13 @@ struct DispatchReduce } // Log single_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", active_policy.SingleTile().BlockThreads(), (long long) stream, active_policy.SingleTile().ItemsPerThread()); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke DeviceReduceSingleTileKernel launcher_factory(1, active_policy.SingleTile().BlockThreads(), 0, stream) @@ -698,7 +702,7 @@ template < typename AccumT = ::cuda::std:: __accumulator_t>, InitT>, typename PolicyHub = detail::reduce::policy_hub, - typename KernelSource = DeviceReduceKernelSource< + typename KernelSource = detail::reduce::DeviceReduceKernelSource< typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, @@ -881,7 +885,7 @@ struct DispatchSegmentedReduce } // Log device_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread, %d SM occupancy\n", num_segments, @@ -889,7 +893,7 @@ struct DispatchSegmentedReduce (long long) stream, ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, segmented_reduce_config.sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke DeviceReduceKernel 
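// (For reference: the triple_chevron launcher used throughout these dispatch layers is a thin
//  wrapper over a <<<grid, block, smem, stream>>> launch that can also be used from device code
//  when CDP is enabled. A minimal sketch of the calling pattern, with illustrative argument
//  names only:
//
//    THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
//        grid_size, block_threads, /* smem */ 0, stream)
//      .doit(SomeKernel<MaxPolicyT /*, ... */>, kernel_arg0, kernel_arg1);
//
//  The kernel entry point is passed as an ordinary function pointer, followed by its arguments.)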
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -920,7 +924,7 @@ struct DispatchSegmentedReduce { // Force kernel code-generation in all compiler passes return InvokePasses( - DeviceSegmentedReduceKernel< + detail::reduce::DeviceSegmentedReduceKernel< typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 804371588f3..d13a9c10b64 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -64,6 +64,9 @@ CUB_NAMESPACE_BEGIN * Kernel entry points *****************************************************************************/ +namespace detail::reduce +{ + /** * @brief Multi-block reduce-by-key sweep kernel entry point * @@ -176,6 +179,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH .ConsumeRange(num_items, tile_state, start_tile); } +} // namespace detail::reduce + /****************************************************************************** * Dispatch ******************************************************************************/ @@ -341,9 +346,9 @@ struct DispatchReduceByKey // Log init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -391,7 +396,7 @@ struct DispatchReduceByKey for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log reduce_by_key_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d " "items per thread, %d SM occupancy\n", start_tile, @@ -400,7 +405,7 @@ struct DispatchReduceByKey (long long) stream, items_per_thread, reduce_by_key_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke reduce_by_key_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -439,8 +444,8 @@ struct DispatchReduceByKey CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { return Invoke( - DeviceCompactInitKernel, - DeviceReduceByKeyKernel< + detail::scan::DeviceCompactInitKernel, + detail::reduce::DeviceReduceByKeyKernel< typename PolicyHub::MaxPolicy, KeysInputIteratorT, UniqueOutputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index b1542462a58..697edc8f4e6 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -65,6 +65,9 @@ CUB_NAMESPACE_BEGIN * Kernel entry points *****************************************************************************/ +namespace detail::rle +{ + /** * Select kernel entry point (multi-block) * @@ -152,6 +155,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items) .ConsumeRange(num_tiles, tile_status, d_num_runs_out); } +} // namespace detail::rle /****************************************************************************** * Dispatch @@ 
-349,12 +353,12 @@ struct DeviceRleDispatch // Log device_scan_init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -405,7 +409,7 @@ struct DeviceRleDispatch scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log device_rle_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per " "thread, %d SM occupancy\n", scan_grid_size.x, @@ -415,7 +419,7 @@ struct DeviceRleDispatch (long long) stream, items_per_thread, device_rle_kernel_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke device_rle_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -451,15 +455,16 @@ struct DeviceRleDispatch CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { return Invoke( - DeviceCompactInitKernel, - DeviceRleSweepKernel); + detail::scan::DeviceCompactInitKernel, + detail::rle::DeviceRleSweepKernel< + typename PolicyHub::MaxPolicy, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + NumRunsOutputIteratorT, + ScanTileStateT, + EqualityOpT, + OffsetT>); } /** diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 4db31cf6989..2fdfe29ab7f 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -260,9 +260,9 @@ struct DispatchScan // Log init_kernel configuration int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -305,7 +305,7 @@ struct DispatchScan for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log scan_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", start_tile, @@ -314,7 +314,7 @@ struct DispatchScan (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) @@ -345,16 +345,17 @@ struct DispatchScan using ScanTileStateT = typename cub::ScanTileState; // Ensure kernels are instantiated. 
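// (As a side note: "ensure kernels are instantiated" works by handing the kernel entry points to
//  Invoke() as function pointers, so every compilation pass emits their code even though the
//  launch itself only happens at run time. A minimal sketch of the shape; the parameter names
//  are illustrative, not the actual signature:
//
//    template <typename InitKernelT, typename ScanKernelT>
//    CUB_RUNTIME_FUNCTION cudaError_t Invoke(InitKernelT init_kernel, ScanKernelT scan_kernel);
//
//  The only functional change in this hunk is that the entry points are now spelled with their
//  detail::scan:: qualification.)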
return Invoke( - DeviceScanInitKernel, - DeviceScanKernel); + detail::scan::DeviceScanInitKernel, + detail::scan::DeviceScanKernel< + typename PolicyHub::MaxPolicy, + InputIteratorT, + OutputIteratorT, + ScanTileStateT, + ScanOpT, + InitValueT, + OffsetT, + AccumT, + ForceInclusive>); } /** diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index c88656dff48..9478543ab3b 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -63,6 +63,9 @@ CUB_NAMESPACE_BEGIN * Kernel entry points *****************************************************************************/ +namespace detail::scan_by_key +{ + /** * @brief Scan by key kernel entry point (multi-block) * @@ -150,16 +153,16 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THRE using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT; // Thread block type for scanning input tiles - using AgentScanByKeyT = - AgentScanByKey; + using AgentScanByKeyT = detail::scan_by_key::AgentScanByKey< + ScanByKeyPolicyT, + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOp, + ScanOpT, + InitValueT, + OffsetT, + AccumT>; // Shared memory for AgentScanByKey __shared__ typename AgentScanByKeyT::TempStorage temp_storage; @@ -188,6 +191,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyInitKernel( d_keys_prev_in[tid] = d_keys_in[tile_base - 1]; } } +} // namespace detail::scan_by_key /****************************************************************************** * Dispatch @@ -406,9 +410,9 @@ struct DispatchScanByKey // Log init_kernel configuration int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -441,7 +445,7 @@ struct DispatchScanByKey for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log scan_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread\n", start_tile, @@ -449,7 +453,7 @@ struct DispatchScanByKey Policy::BLOCK_THREADS, (long long) stream, Policy::ITEMS_PER_THREAD); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) @@ -489,17 +493,18 @@ struct DispatchScanByKey { // Ensure kernels are instantiated. 
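// (For orientation: DeviceScanByKeyInitKernel, shown further up, initializes the tile state and
//  additionally caches the key just before each tile's first item into d_keys_prev_in
//  (d_keys_prev_in[tid] = d_keys_in[tile_base - 1]). A small index sketch, assuming each tile
//  spans ITEMS_PER_TILE consecutive items; the names below are illustrative:
//
//    tile_base           = tid * ITEMS_PER_TILE;       // first item owned by tile `tid`
//    d_keys_prev_in[tid] = d_keys_in[tile_base - 1];   // last key of the preceding tile
//
//  With that value on hand, each tile of the scan-by-key sweep can decide whether its first item
//  continues the previous segment without re-reading its neighbor's input.)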
return Invoke( - DeviceScanByKeyInitKernel, - DeviceScanByKeyKernel); + detail::scan_by_key::DeviceScanByKeyInitKernel, + detail::scan_by_key::DeviceScanByKeyKernel< + typename PolicyHub::MaxPolicy, + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + ScanByKeyTileStateT, + EqualityOp, + ScanOpT, + InitValueT, + OffsetT, + AccumT>); } /** diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 9d011d414ba..5690371d3fb 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -54,10 +54,14 @@ #include #include +#include #include #include #include +#include +#include +#include #include #include @@ -66,6 +70,44 @@ CUB_NAMESPACE_BEGIN +namespace detail::segmented_sort +{ +// Type used to index within segments within a single invocation +using local_segment_index_t = ::cuda::std::uint32_t; +// Type used for total number of segments and to index within segments globally +using global_segment_offset_t = ::cuda::std::int64_t; + +template +class OffsetIteratorT : public THRUST_NS_QUALIFIER::iterator_adaptor, Iterator> +{ +public: + using super_t = THRUST_NS_QUALIFIER::iterator_adaptor, Iterator>; + + OffsetIteratorT() = default; + + _CCCL_HOST_DEVICE OffsetIteratorT(const Iterator& it, OffsetItT offset_it) + : super_t(it) + , offset_it(offset_it) + {} + + // befriend thrust::iterator_core_access to allow it access to the private interface below + friend class THRUST_NS_QUALIFIER::iterator_core_access; + +private: + OffsetItT offset_it; + + _CCCL_HOST_DEVICE typename super_t::reference dereference() const + { + return *(this->base() + (*offset_it)); + } +}; + +template +_CCCL_HOST_DEVICE OffsetIteratorT make_offset_iterator(const Iterator& it, OffsetItT offset_it) +{ + return OffsetIteratorT{it, offset_it}; +} + /** * @brief Fallback kernel, in case there's not enough segments to * take advantage of partitioning. 
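// (To illustrate: OffsetIteratorT above merely re-bases another iterator by a runtime shift;
//  element i of the adapted iterator reads base[i + *offset_it]. A minimal usage sketch, where
//  d_begin_offsets and seg_base are illustrative names:
//
//    #include <thrust/iterator/constant_iterator.h>
//    auto shifted = detail::segmented_sort::make_offset_iterator(
//      d_begin_offsets, THRUST_NS_QUALIFIER::constant_iterator<global_segment_offset_t>{seg_base});
//    // shifted[i] now reads d_begin_offsets[seg_base + i]
//
//  The streaming dispatch below composes exactly this with a constant_iterator holding the first
//  global segment of the current invocation, so segments can keep being indexed with 32-bit
//  local indices while the total segment count is 64-bit.)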
@@ -117,10 +159,10 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortFallbackKernel( const KeyT* d_keys_in_orig, KeyT* d_keys_out_orig, - cub::detail::device_double_buffer d_keys_double_buffer, + device_double_buffer d_keys_double_buffer, const ValueT* d_values_in_orig, ValueT* d_values_out_orig, - cub::detail::device_double_buffer d_values_double_buffer, + device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { @@ -128,10 +170,10 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; using MediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT::MediumPolicyT; - const unsigned int segment_id = blockIdx.x; - OffsetT segment_begin = d_begin_offsets[segment_id]; - OffsetT segment_end = d_end_offsets[segment_id]; - OffsetT num_items = segment_end - segment_begin; + const auto segment_id = static_cast(blockIdx.x); + OffsetT segment_begin = d_begin_offsets[segment_id]; + OffsetT segment_end = d_end_offsets[segment_id]; + OffsetT num_items = segment_end - segment_begin; if (num_items <= 0) { @@ -139,11 +181,12 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD } using AgentSegmentedRadixSortT = - cub::AgentSegmentedRadixSort; + radix_sort::AgentSegmentedRadixSort; using WarpReduceT = cub::WarpReduce; - using AgentWarpMergeSortT = AgentSubWarpSort; + using AgentWarpMergeSortT = + sub_warp_merge_sort::AgentSubWarpSort; __shared__ union { @@ -187,14 +230,14 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { // Sort by a CTA with multiple reads from global memory int current_bit = begin_bit; - int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + int pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - d_keys_double_buffer = cub::detail::device_double_buffer( + d_keys_double_buffer = device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); if (!keys_only) { - d_values_double_buffer = cub::detail::device_double_buffer( + d_values_double_buffer = device_double_buffer( d_values_double_buffer.current() + segment_begin, d_values_double_buffer.alternate() + segment_begin); } @@ -210,9 +253,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD #pragma unroll 1 while (current_bit < end_bit) { - pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - CTA_SYNC(); + __syncthreads(); agent.ProcessIterative( current_bit, pass_bits, @@ -291,11 +334,11 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall( - unsigned int small_segments, - unsigned int medium_segments, - unsigned int medium_blocks, - const unsigned int* d_small_segments_indices, - const unsigned int* d_medium_segments_indices, + local_segment_index_t small_segments, + local_segment_index_t medium_segments, + local_segment_index_t medium_blocks, + const local_segment_index_t* d_small_segments_indices, + const local_segment_index_t* d_medium_segments_indices, const KeyT* d_keys_in, KeyT* 
d_keys_out, const ValueT* d_values_in, @@ -303,25 +346,30 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { - const unsigned int tid = threadIdx.x; - const unsigned int bid = blockIdx.x; + using local_segment_index_t = local_segment_index_t; + + const local_segment_index_t tid = threadIdx.x; + const local_segment_index_t bid = blockIdx.x; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; using MediumPolicyT = typename SmallAndMediumPolicyT::MediumPolicyT; using SmallPolicyT = typename SmallAndMediumPolicyT::SmallPolicyT; - constexpr int threads_per_medium_segment = MediumPolicyT::WARP_THREADS; - constexpr int threads_per_small_segment = SmallPolicyT::WARP_THREADS; + constexpr auto threads_per_medium_segment = static_cast(MediumPolicyT::WARP_THREADS); + constexpr auto threads_per_small_segment = static_cast(SmallPolicyT::WARP_THREADS); - using MediumAgentWarpMergeSortT = AgentSubWarpSort; + using MediumAgentWarpMergeSortT = + sub_warp_merge_sort::AgentSubWarpSort; - using SmallAgentWarpMergeSortT = AgentSubWarpSort; + using SmallAgentWarpMergeSortT = + sub_warp_merge_sort::AgentSubWarpSort; constexpr auto segments_per_medium_block = - static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); + static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); - constexpr auto segments_per_small_block = static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); + constexpr auto segments_per_small_block = + static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); __shared__ union { @@ -332,12 +380,12 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic if (bid < medium_blocks) { - const unsigned int sid_within_block = tid / threads_per_medium_segment; - const unsigned int medium_segment_id = bid * segments_per_medium_block + sid_within_block; + const local_segment_index_t sid_within_block = tid / threads_per_medium_segment; + const local_segment_index_t medium_segment_id = bid * segments_per_medium_block + sid_within_block; if (medium_segment_id < medium_segments) { - const unsigned int global_segment_id = d_medium_segments_indices[medium_segment_id]; + const local_segment_index_t global_segment_id = d_medium_segments_indices[medium_segment_id]; const OffsetT segment_begin = d_begin_offsets[global_segment_id]; const OffsetT segment_end = d_end_offsets[global_segment_id]; @@ -353,12 +401,12 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic } else { - const unsigned int sid_within_block = tid / threads_per_small_segment; - const unsigned int small_segment_id = (bid - medium_blocks) * segments_per_small_block + sid_within_block; + const local_segment_index_t sid_within_block = tid / threads_per_small_segment; + const local_segment_index_t small_segment_id = (bid - medium_blocks) * segments_per_small_block + sid_within_block; if (small_segment_id < small_segments) { - const unsigned int global_segment_id = d_small_segments_indices[small_segment_id]; + const local_segment_index_t global_segment_id = d_small_segments_indices[small_segment_id]; const OffsetT segment_begin = d_begin_offsets[global_segment_id]; const OffsetT segment_end = d_end_offsets[global_segment_id]; @@ -410,35 +458,36 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void 
DeviceSegmentedSortKernelLarge( - const unsigned int* d_segments_indices, + const local_segment_index_t* d_segments_indices, const KeyT* d_keys_in_orig, KeyT* d_keys_out_orig, - cub::detail::device_double_buffer d_keys_double_buffer, + device_double_buffer d_keys_double_buffer, const ValueT* d_values_in_orig, ValueT* d_values_out_orig, - cub::detail::device_double_buffer d_values_double_buffer, + device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { - using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; - using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; + using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; + using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; + using local_segment_index_t = local_segment_index_t; constexpr int small_tile_size = LargeSegmentPolicyT::BLOCK_THREADS * LargeSegmentPolicyT::ITEMS_PER_THREAD; using AgentSegmentedRadixSortT = - cub::AgentSegmentedRadixSort; + radix_sort::AgentSegmentedRadixSort; __shared__ typename AgentSegmentedRadixSortT::TempStorage storage; - const unsigned int bid = blockIdx.x; + const local_segment_index_t bid = blockIdx.x; constexpr int begin_bit = 0; constexpr int end_bit = sizeof(KeyT) * 8; - const unsigned int global_segment_id = d_segments_indices[bid]; - const OffsetT segment_begin = d_begin_offsets[global_segment_id]; - const OffsetT segment_end = d_end_offsets[global_segment_id]; - const OffsetT num_items = segment_end - segment_begin; + const local_segment_index_t global_segment_id = d_segments_indices[bid]; + const OffsetT segment_begin = d_begin_offsets[global_segment_id]; + const OffsetT segment_end = d_end_offsets[global_segment_id]; + const OffsetT num_items = segment_end - segment_begin; constexpr bool keys_only = std::is_same::value; AgentSegmentedRadixSortT agent(num_items, storage); @@ -461,14 +510,14 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { // Sort reading global memory multiple times int current_bit = begin_bit; - int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + int pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - d_keys_double_buffer = cub::detail::device_double_buffer( + d_keys_double_buffer = device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); if (!keys_only) { - d_values_double_buffer = cub::detail::device_double_buffer( + d_values_double_buffer = device_double_buffer( d_values_double_buffer.current() + segment_begin, d_values_double_buffer.alternate() + segment_begin); } @@ -484,9 +533,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD #pragma unroll 1 while (current_bit < end_bit) { - pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - CTA_SYNC(); + __syncthreads(); agent.ProcessIterative( current_bit, pass_bits, @@ -522,33 +571,35 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont int num_segments, KeyT* d_current_keys, KeyT* d_final_keys, - detail::device_double_buffer d_keys_double_buffer, + device_double_buffer d_keys_double_buffer, ValueT* d_current_values, ValueT* d_final_values, - detail::device_double_buffer d_values_double_buffer, + device_double_buffer d_values_double_buffer, 
BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, - unsigned int* group_sizes, - unsigned int* large_and_medium_segments_indices, - unsigned int* small_segments_indices, + local_segment_index_t* group_sizes, + local_segment_index_t* large_and_medium_segments_indices, + local_segment_index_t* small_segments_indices, cudaStream_t stream) { + using local_segment_index_t = local_segment_index_t; + cudaError error = cudaSuccess; - const unsigned int large_segments = group_sizes[0]; + const local_segment_index_t large_segments = group_sizes[0]; if (large_segments > 0) { // One CTA per segment - const unsigned int blocks_in_grid = large_segments; + const local_segment_index_t blocks_in_grid = large_segments; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "DeviceSegmentedSortKernelLarge<<<%d, %d, 0, %lld>>>()\n", static_cast(blocks_in_grid), LargeSegmentPolicyT::BLOCK_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream) @@ -571,32 +622,34 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont } // Sync the stream if specified to flush runtime errors - error = CubDebug(detail::DebugSyncStream(stream)); + error = CubDebug(DebugSyncStream(stream)); if (cudaSuccess != error) { return error; } } - const unsigned int small_segments = group_sizes[1]; - const unsigned int medium_segments = static_cast(num_segments) - (large_segments + small_segments); + const local_segment_index_t small_segments = group_sizes[1]; + const local_segment_index_t medium_segments = + static_cast(num_segments) - (large_segments + small_segments); - const unsigned int small_blocks = ::cuda::ceil_div(small_segments, SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); + const local_segment_index_t small_blocks = + ::cuda::ceil_div(small_segments, SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); - const unsigned int medium_blocks = + const local_segment_index_t medium_blocks = ::cuda::ceil_div(medium_segments, SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); - const unsigned int small_and_medium_blocks_in_grid = small_blocks + medium_blocks; + const local_segment_index_t small_and_medium_blocks_in_grid = small_blocks + medium_blocks; if (small_and_medium_blocks_in_grid) { -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "DeviceSegmentedSortKernelSmall<<<%d, %d, 0, %lld>>>()\n", static_cast(small_and_medium_blocks_in_grid), SmallAndMediumPolicyT::BLOCK_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( small_and_medium_blocks_in_grid, SmallAndMediumPolicyT::BLOCK_THREADS, 0, stream) @@ -621,7 +674,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont } // Sync the stream if specified to flush runtime errors - error = CubDebug(detail::DebugSyncStream(stream)); + error = CubDebug(DebugSyncStream(stream)); if (cudaSuccess != error) { return error; @@ -646,18 +699,18 @@ template d_keys_double_buffer, + device_double_buffer d_keys_double_buffer, ValueT* d_current_values, ValueT* d_final_values, - detail::device_double_buffer d_values_double_buffer, + device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, - unsigned int* group_sizes, - unsigned int* 
large_and_medium_segments_indices, - unsigned int* small_segments_indices) + local_segment_index_t* group_sizes, + local_segment_index_t* large_and_medium_segments_indices, + local_segment_index_t* small_segments_indices) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; @@ -672,27 +725,30 @@ __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContin // // Due to (4, 5), we can't pass the user-provided stream in the continuation. // Due to (1, 2, 3) it's safe to pass the main stream. - cudaError_t error = DeviceSegmentedSortContinuation( - large_kernel, - small_kernel, - num_segments, - d_current_keys, - d_final_keys, - d_keys_double_buffer, - d_current_values, - d_final_values, - d_values_double_buffer, - d_begin_offsets, - d_end_offsets, - group_sizes, - large_and_medium_segments_indices, - small_segments_indices, - 0); // always launching on the main stream (see motivation above) + cudaError_t error = + detail::segmented_sort::DeviceSegmentedSortContinuation( + large_kernel, + small_kernel, + num_segments, + d_current_keys, + d_final_keys, + d_keys_double_buffer, + d_current_values, + d_final_values, + d_values_double_buffer, + d_begin_offsets, + d_end_offsets, + group_sizes, + large_and_medium_segments_indices, + small_segments_indices, + 0); // always launching on the main stream (see motivation above) error = CubDebug(error); } #endif // CUB_RDC_ENABLED +} // namespace detail::segmented_sort + template > struct DispatchSegmentedSort { + using local_segment_index_t = detail::segmented_sort::local_segment_index_t; + using global_segment_offset_t = detail::segmented_sort::global_segment_offset_t; + + using StreamingBeginOffsetIteratorT = + detail::segmented_sort::OffsetIteratorT>; + using StreamingEndOffsetIteratorT = + detail::segmented_sort::OffsetIteratorT>; + static constexpr int KEYS_ONLY = std::is_same::value; struct LargeSegmentsSelectorT @@ -709,6 +775,7 @@ struct DispatchSegmentedSort OffsetT value{}; BeginOffsetIteratorT d_offset_begin{}; EndOffsetIteratorT d_offset_end{}; + global_segment_offset_t base_segment_offset{}; _CCCL_HOST_DEVICE _CCCL_FORCEINLINE LargeSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end) @@ -717,9 +784,10 @@ struct DispatchSegmentedSort , d_offset_end(d_offset_end) {} - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(unsigned int segment_id) const + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const { - const OffsetT segment_size = d_offset_end[segment_id] - d_offset_begin[segment_id]; + const OffsetT segment_size = + d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id]; return segment_size > value; } }; @@ -729,6 +797,7 @@ struct DispatchSegmentedSort OffsetT value{}; BeginOffsetIteratorT d_offset_begin{}; EndOffsetIteratorT d_offset_end{}; + global_segment_offset_t base_segment_offset{}; _CCCL_HOST_DEVICE _CCCL_FORCEINLINE SmallSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end) @@ -737,9 +806,10 @@ struct DispatchSegmentedSort , d_offset_end(d_offset_end) {} - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(unsigned int segment_id) const + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const { - const OffsetT segment_size = d_offset_end[segment_id] - d_offset_begin[segment_id]; + const OffsetT segment_size = + 
d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id]; return segment_size < value; } }; @@ -770,10 +840,10 @@ struct DispatchSegmentedSort DoubleBuffer& d_values; /// Number of items to sort - OffsetT num_items; + ::cuda::std::int64_t num_items; /// The number of segments that comprise the sorting data - int num_segments; + global_segment_offset_t num_segments; /** * Random-access input iterator to the sequence of beginning offsets of length @@ -802,8 +872,8 @@ struct DispatchSegmentedSort std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - OffsetT num_items, - int num_segments, + ::cuda::std::int64_t num_items, + global_segment_offset_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, @@ -871,9 +941,10 @@ struct DispatchSegmentedSort } } - auto large_and_medium_segments_indices = large_and_medium_partitioning_slot->create_alias(); - auto small_segments_indices = small_partitioning_slot->create_alias(); - auto group_sizes = group_sizes_slot->create_alias(); + auto large_and_medium_segments_indices = + large_and_medium_partitioning_slot->create_alias(); + auto small_segments_indices = small_partitioning_slot->create_alias(); + auto group_sizes = group_sizes_slot->create_alias(); std::size_t three_way_partition_temp_storage_bytes{}; @@ -887,8 +958,13 @@ struct DispatchSegmentedSort if (partition_segments) { - large_and_medium_segments_indices.grow(num_segments); - small_segments_indices.grow(num_segments); + constexpr auto num_segments_per_invocation_limit = + static_cast(::cuda::std::numeric_limits::max()); + auto const max_num_segments_per_invocation = static_cast( + (::cuda::std::min)(static_cast(num_segments), num_segments_per_invocation_limit)); + + large_and_medium_segments_indices.grow(max_num_segments_per_invocation); + small_segments_indices.grow(max_num_segments_per_invocation); group_sizes.grow(num_selected_groups); auto medium_indices_iterator = @@ -897,12 +973,12 @@ struct DispatchSegmentedSort cub::DevicePartition::IfNoNVTX( nullptr, three_way_partition_temp_storage_bytes, - THRUST_NS_QUALIFIER::counting_iterator(0), + THRUST_NS_QUALIFIER::counting_iterator(0), large_and_medium_segments_indices.get(), small_segments_indices.get(), medium_indices_iterator, group_sizes.get(), - num_segments, + max_num_segments_per_invocation, large_segments_selector, small_segments_selector, stream); @@ -1002,20 +1078,22 @@ struct DispatchSegmentedSort // Partition input segments into size groups and assign specialized // kernels for each of them. error = SortWithPartitioning( - DeviceSegmentedSortKernelLarge, - DeviceSegmentedSortKernelSmall, + detail::segmented_sort::DeviceSegmentedSortKernelLarge< + IS_DESCENDING, + MaxPolicyT, + KeyT, + ValueT, + StreamingBeginOffsetIteratorT, + StreamingEndOffsetIteratorT, + OffsetT>, + detail::segmented_sort::DeviceSegmentedSortKernelSmall< + IS_DESCENDING, + MaxPolicyT, + KeyT, + ValueT, + StreamingBeginOffsetIteratorT, + StreamingEndOffsetIteratorT, + OffsetT>, three_way_partition_temp_storage_bytes, d_keys_double_buffer, d_values_double_buffer, @@ -1032,13 +1110,14 @@ struct DispatchSegmentedSort // on extra partitioning steps. 
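// (For reference: the SortWithPartitioning path sizes its index arrays for at most
//  numeric_limits<local_segment_index_t>::max() segments per invocation and, as the loop further
//  below shows, processes larger segment counts in several invocations:
//
//    num_invocations = ::cuda::ceil_div(num_segments, global_segment_offset_t{0xFFFFFFFF});
//    // e.g. num_segments == 10'000'000'000  ->  3 invocations, the last covering
//    //      10'000'000'000 - 2 * 4'294'967'295 = 1'410'065'410 segments
//
//  The figures are an illustrative worked example, not values taken from the code.)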
error = SortWithoutPartitioning( - DeviceSegmentedSortFallbackKernel, + detail::segmented_sort::DeviceSegmentedSortFallbackKernel< + IS_DESCENDING, + MaxPolicyT, + KeyT, + ValueT, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>, d_keys_double_buffer, d_values_double_buffer); } @@ -1056,8 +1135,8 @@ struct DispatchSegmentedSort std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - OffsetT num_items, - int num_segments, + ::cuda::std::int64_t num_items, + global_segment_offset_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, @@ -1136,35 +1215,56 @@ private: LargeSegmentsSelectorT& large_segments_selector, SmallSegmentsSelectorT& small_segments_selector, cub::detail::temporary_storage::alias& device_partition_temp_storage, - cub::detail::temporary_storage::alias& large_and_medium_segments_indices, - cub::detail::temporary_storage::alias& small_segments_indices, - cub::detail::temporary_storage::alias& group_sizes) + cub::detail::temporary_storage::alias& large_and_medium_segments_indices, + cub::detail::temporary_storage::alias& small_segments_indices, + cub::detail::temporary_storage::alias& group_sizes) { cudaError_t error = cudaSuccess; - auto medium_indices_iterator = - THRUST_NS_QUALIFIER::make_reverse_iterator(large_and_medium_segments_indices.get() + num_segments); - - error = CubDebug(cub::DevicePartition::IfNoNVTX( - device_partition_temp_storage.get(), - three_way_partition_temp_storage_bytes, - THRUST_NS_QUALIFIER::counting_iterator(0), - large_and_medium_segments_indices.get(), - small_segments_indices.get(), - medium_indices_iterator, - group_sizes.get(), - num_segments, - large_segments_selector, - small_segments_selector, - stream)); - if (cudaSuccess != error) + constexpr global_segment_offset_t num_segments_per_invocation_limit = + static_cast(::cuda::std::numeric_limits::max()); + + // We repeatedly invoke the partitioning and sorting kernels until all segments are processed. + const global_segment_offset_t num_invocations = + ::cuda::ceil_div(static_cast(num_segments), num_segments_per_invocation_limit); + for (global_segment_offset_t invocation_index = 0; invocation_index < num_invocations; invocation_index++) { - return error; - } + const global_segment_offset_t current_seg_offset = invocation_index * num_segments_per_invocation_limit; + const local_segment_index_t current_num_segments = + (invocation_index == (num_invocations - 1)) + ? 
static_cast(num_segments - current_seg_offset) + : num_segments_per_invocation_limit; + + large_segments_selector.base_segment_offset = current_seg_offset; + small_segments_selector.base_segment_offset = current_seg_offset; + auto current_begin_offset = detail::segmented_sort::make_offset_iterator( + d_begin_offsets, THRUST_NS_QUALIFIER::constant_iterator{current_seg_offset}); + auto current_end_offset = detail::segmented_sort::make_offset_iterator( + d_end_offsets, THRUST_NS_QUALIFIER::constant_iterator{current_seg_offset}); + + auto medium_indices_iterator = + THRUST_NS_QUALIFIER::make_reverse_iterator(large_and_medium_segments_indices.get() + current_num_segments); + + error = CubDebug(cub::DevicePartition::IfNoNVTX( + device_partition_temp_storage.get(), + three_way_partition_temp_storage_bytes, + THRUST_NS_QUALIFIER::counting_iterator(0), + large_and_medium_segments_indices.get(), + small_segments_indices.get(), + medium_indices_iterator, + group_sizes.get(), + current_num_segments, + large_segments_selector, + small_segments_selector, + stream)); + if (cudaSuccess != error) + { + return error; + } - // The device path is only used (and only compiles) when CDP is enabled. - // It's defined in a macro since we can't put `#ifdef`s inside of - // `NV_IF_TARGET`. + // The device path is only used (and only compiles) when CDP is enabled. + // It's defined in a macro since we can't put `#ifdef`s inside of + // `NV_IF_TARGET`. #ifndef CUB_RDC_ENABLED # define CUB_TEMP_DEVICE_CODE @@ -1175,25 +1275,25 @@ private: error = \ THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \ .doit( \ - DeviceSegmentedSortContinuationKernel< \ + detail::segmented_sort::DeviceSegmentedSortContinuationKernel< \ typename PolicyHub::MaxPolicy, \ LargeKernelT, \ SmallKernelT, \ KeyT, \ ValueT, \ - BeginOffsetIteratorT, \ - EndOffsetIteratorT>, \ + StreamingBeginOffsetIteratorT, \ + StreamingEndOffsetIteratorT>, \ large_kernel, \ small_kernel, \ - num_segments, \ + current_num_segments, \ d_keys.Current(), \ GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), \ d_keys_double_buffer, \ d_values.Current(), \ GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), \ d_values_double_buffer, \ - d_begin_offsets, \ - d_end_offsets, \ + current_begin_offset, \ + current_end_offset, \ group_sizes.get(), \ large_and_medium_segments_indices.get(), \ small_segments_indices.get()); \ @@ -1212,16 +1312,16 @@ private: #endif // CUB_RDC_ENABLED - // Clang format mangles some of this NV_IF_TARGET block - // clang-format off + // Clang format mangles some of this NV_IF_TARGET block + // clang-format off NV_IF_TARGET( NV_IS_HOST, ( - unsigned int h_group_sizes[num_selected_groups]; + local_segment_index_t h_group_sizes[num_selected_groups]; error = CubDebug(cudaMemcpyAsync(h_group_sizes, group_sizes.get(), num_selected_groups * - sizeof(unsigned int), + sizeof(local_segment_index_t), cudaMemcpyDeviceToHost, stream)); @@ -1236,27 +1336,27 @@ private: return error; } - error = DeviceSegmentedSortContinuation( large_kernel, small_kernel, - num_segments, + current_num_segments, d_keys.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), d_keys_double_buffer, d_values.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), d_values_double_buffer, - d_begin_offsets, - d_end_offsets, + current_begin_offset, + current_end_offset, h_group_sizes, large_and_medium_segments_indices.get(), small_segments_indices.get(), stream);), // NV_IS_DEVICE: (CUB_TEMP_DEVICE_CODE)); - // clang-format 
on - + // clang-format on + } #undef CUB_TEMP_DEVICE_CODE return error; @@ -1270,11 +1370,11 @@ private: { cudaError_t error = cudaSuccess; - const auto blocks_in_grid = static_cast(num_segments); + const auto blocks_in_grid = static_cast(num_segments); constexpr auto threads_in_block = static_cast(LargeSegmentPolicyT::BLOCK_THREADS); // Log kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceSegmentedSortFallbackKernel<<<%d, %d, " "0, %lld>>>(), %d items per thread, bit_grain %d\n", blocks_in_grid, @@ -1282,7 +1382,7 @@ private: (long long) stream, LargeSegmentPolicyT::ITEMS_PER_THREAD, LargeSegmentPolicyT::RADIX_BITS); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke fallback kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index c41dfb389eb..5c370c8b0c9 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -62,10 +62,7 @@ CUB_NAMESPACE_BEGIN -namespace detail -{ - -namespace select +namespace detail::select { // Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition using per_partition_offset_t = ::cuda::std::int32_t; @@ -231,8 +228,6 @@ struct agent_select_if_wrapper_t MayAlias>::AgentSelectIf; }; }; -} // namespace select -} // namespace detail /****************************************************************************** * Kernel entry points @@ -329,9 +324,9 @@ template __launch_bounds__(int( - cub::detail::vsmem_helper_default_fallback_policy_t< + vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, - detail::select::agent_select_if_wrapper_t::template agent_t, + agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -350,11 +345,11 @@ __launch_bounds__(int( OffsetT num_items, int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context, - cub::detail::vsmem_t vsmem) + vsmem_t vsmem) { - using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< + using VsmemHelperT = vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, - detail::select::agent_select_if_wrapper_t::template agent_t, + agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -381,6 +376,7 @@ __launch_bounds__(int( // If applicable, hints to discard modified cache lines for vsmem VsmemHelperT::discard_temp_storage(temp_storage); } +} // namespace detail::select /****************************************************************************** * Dispatch @@ -660,7 +656,7 @@ struct DispatchSelectIf // Log scan_init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(current_num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, @@ -693,7 +689,7 @@ struct DispatchSelectIf } // Log select_if_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; @@ -756,8 +752,8 @@ struct DispatchSelectIf CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { return Invoke( - DeviceCompactInitKernel, - 
DeviceSelectSweepKernel< + detail::scan::DeviceCompactInitKernel, + detail::select::DeviceSelectSweepKernel< typename PolicyHub::MaxPolicy, InputIteratorT, FlagsInputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index 6dc4f44aeca..24ef2845dee 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -83,8 +83,11 @@ CUB_NAMESPACE_BEGIN * @param[in] spmv_params * SpMV input parameter bundle */ +_CCCL_SUPPRESS_DEPRECATED_PUSH template -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams spmv_params) +CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams spmv_params) // + _CCCL_SUPPRESS_DEPRECATED_POP { using VectorValueIteratorT = CacheModifiedInputIterator; @@ -132,8 +135,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams -CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceSpmvSearchKernel(int num_merge_tiles, CoordinateT* d_tile_coordinates, SpmvParamsT spmv_params) +CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel( + int num_merge_tiles, CoordinateT* d_tile_coordinates, SpmvParamsT spmv_params) { /// Constants enum @@ -217,6 +221,7 @@ template +CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") __launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel( SpmvParams spmv_params, CoordinateT* d_tile_coordinates, @@ -226,7 +231,9 @@ __launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES int num_segment_fixup_tiles) { // Spmv agent type specialization + _CCCL_SUPPRESS_DEPRECATED_PUSH using AgentSpmvT = AgentSpmv; + _CCCL_SUPPRESS_DEPRECATED_POP // Shared memory for AgentSpmv __shared__ typename AgentSpmvT::TempStorage temp_storage; @@ -248,6 +255,7 @@ __launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES * Whether the input parameter Beta is 0 */ template +CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) { const int row = static_cast(threadIdx.x + blockIdx.x * blockDim.x); @@ -298,18 +306,21 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvEmptyMatrixKernel(SpmvParams +CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") __launch_bounds__(int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel( PairsInputIteratorT d_pairs_in, AggregatesOutputIteratorT d_aggregates_out, OffsetT num_items, int num_tiles, - ScanTileStateT tile_state) + ScanTileStateT tile_state) // + _CCCL_SUPPRESS_DEPRECATED_POP { // Thread block type for reducing tiles of value segments using AgentSegmentFixupT = @@ -342,7 +353,7 @@ __launch_bounds__(int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) * Signed integer type for global offsets */ template -struct DispatchSpmv +struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv { //--------------------------------------------------------------------- // Constants and Types @@ -625,12 +636,12 @@ struct DispatchSpmv constexpr int threads_in_block = EMPTY_MATRIX_KERNEL_THREADS; const int blocks_in_grid = ::cuda::ceil_div(spmv_params.num_rows, threads_in_block); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_empty_matrix_kernel<<<%d, %d, 0, %lld>>>()\n", blocks_in_grid, threads_in_block, 
(long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) .doit(spmv_empty_matrix_kernel, spmv_params); @@ -662,12 +673,12 @@ struct DispatchSpmv int degen_col_kernel_block_size = INIT_KERNEL_THREADS; int degen_col_kernel_grid_size = ::cuda::ceil_div(spmv_params.num_rows, degen_col_kernel_block_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -789,12 +800,12 @@ struct DispatchSpmv // Use separate search kernel if we have enough spmv tiles to saturate the device // Log spmv_search_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", search_grid_size, search_block_size, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(search_grid_size, search_block_size, 0, stream) @@ -815,7 +826,7 @@ struct DispatchSpmv } // Log spmv_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", spmv_grid_size.x, spmv_grid_size.y, @@ -824,7 +835,7 @@ struct DispatchSpmv (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke spmv_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(spmv_grid_size, spmv_config.block_threads, 0, stream) @@ -853,7 +864,7 @@ struct DispatchSpmv if (num_merge_tiles > 1) { // Log segment_fixup_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", segment_fixup_grid_size.x, segment_fixup_grid_size.y, @@ -862,7 +873,7 @@ struct DispatchSpmv (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke segment_fixup_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index d4af506a6d9..4114e583d01 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -30,9 +30,7 @@ _CCCL_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_BEGIN -namespace detail -{ -namespace reduce +namespace detail::reduce { template @@ -374,8 +372,7 @@ struct dispatch_streaming_arg_reduce_t } }; -} // namespace reduce -} // namespace detail +} // namespace detail::reduce CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 2d5566d76a3..c1320d59219 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -52,10 +52,7 @@ CUB_NAMESPACE_BEGIN -namespace detail -{ - -namespace three_way_partition +namespace detail::three_way_partition { // Offset type used to 
instantiate the stream three-way-partition-kernel and agent to index the items within one // partition @@ -131,8 +128,6 @@ public: } } }; -} // namespace three_way_partition -} // namespace detail /****************************************************************************** * Kernel entry points @@ -231,6 +226,7 @@ DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, int num_tiles, NumS } } } +} // namespace detail::three_way_partition /****************************************************************************** * Dispatch @@ -319,7 +315,7 @@ struct DispatchThreeWayPartitionIf // The maximum number of items for which we will ever invoke the kernel (i.e. largest partition size) auto const max_partition_size = - static_cast(::cuda::std::min(static_cast(num_items), static_cast(partition_size))); + static_cast((::cuda::std::min)(static_cast(num_items), static_cast(partition_size))); // The number of partitions required to "iterate" over the total input auto const num_partitions = @@ -387,12 +383,12 @@ struct DispatchThreeWayPartitionIf // Log three_way_partition_init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(current_num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking three_way_partition_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, reinterpret_cast(stream)); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke three_way_partition_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -420,7 +416,7 @@ struct DispatchThreeWayPartitionIf } // Log select_if_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; @@ -440,7 +436,7 @@ struct DispatchThreeWayPartitionIf items_per_thread, range_select_sm_occupancy); } -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(current_num_tiles, block_threads, 0, stream) @@ -483,8 +479,8 @@ struct DispatchThreeWayPartitionIf { using MaxPolicyT = typename PolicyHub::MaxPolicy; return Invoke( - DeviceThreeWayPartitionInitKernel, - DeviceThreeWayPartitionKernel< + detail::three_way_partition::DeviceThreeWayPartitionInitKernel, + detail::three_way_partition::DeviceThreeWayPartitionKernel< MaxPolicyT, InputIteratorT, FirstOutputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index fa4fa80d0ef..f35e89a133f 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -53,9 +53,7 @@ _CCCL_NV_DIAG_SUPPRESS(186) CUB_NAMESPACE_BEGIN -namespace detail -{ -namespace transform +namespace detail::transform { template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment) @@ -118,7 +116,7 @@ _CCCL_DEVICE void transform_kernel_impl( constexpr int block_dim = PrefetchPolicy::block_threads; const int tile_stride = block_dim * num_elem_per_thread; const Offset offset = static_cast(blockIdx.x) * tile_stride; - const int tile_size = static_cast(::cuda::std::min(num_items - offset, Offset{tile_stride})); + const int tile_size = static_cast((::cuda::std::min)(num_items - offset, Offset{tile_stride})); // move index and iterator domain to the block/thread index, to reduce arithmetic in the 
loops below { @@ -329,7 +327,7 @@ _CCCL_DEVICE void transform_kernel_ublkcp( constexpr int block_dim = BulkCopyPolicy::block_threads; const int tile_stride = block_dim * num_elem_per_thread; const Offset offset = static_cast(blockIdx.x) * tile_stride; - const int tile_size = ::cuda::std::min(num_items - offset, Offset{tile_stride}); + const int tile_size = (::cuda::std::min)(num_items - offset, Offset{tile_stride}); const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x; if (inner_blocks) @@ -688,10 +686,8 @@ struct dispatch_t(tile_size); if (smem_size > *max_smem) { -# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS // assert should be prevented by smem check in policy - assert(last_counts.elem_per_thread > 0 && "min_items_per_thread exceeds available shared memory"); -# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + _CCCL_ASSERT_HOST(last_counts.elem_per_thread > 0, "min_items_per_thread exceeds available shared memory"); return last_counts; } @@ -729,12 +725,10 @@ struct dispatch_telem_per_thread > 0); - assert(config->tile_size > 0); - assert(config->tile_size % bulk_copy_alignment == 0); - assert((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0)); // logical xor -# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + _CCCL_ASSERT_HOST(config->elem_per_thread > 0, ""); + _CCCL_ASSERT_HOST(config->tile_size > 0, ""); + _CCCL_ASSERT_HOST(config->tile_size % bulk_copy_alignment == 0, ""); + _CCCL_ASSERT_HOST((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0), ""); // logical xor const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{config->tile_size})); return ::cuda::std::make_tuple( @@ -812,7 +806,7 @@ struct dispatch_t( - ::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy))); + (::cuda::std::min)(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy))); const int items_per_thread_clamped = ::cuda::std::clamp( items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread); @@ -862,6 +856,5 @@ struct dispatch_t __launch_bounds__(int( - cub::detail::vsmem_helper_default_fallback_policy_t< + vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT, AgentUniqueByKey, KeyInputIteratorT, @@ -145,9 +147,9 @@ __launch_bounds__(int( EqualityOpT equality_op, OffsetT num_items, int num_tiles, - cub::detail::vsmem_t vsmem) + vsmem_t vsmem) { - using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< + using VsmemHelperT = vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT, AgentUniqueByKey, KeyInputIteratorT, @@ -176,7 +178,7 @@ __launch_bounds__(int( // If applicable, hints to discard modified cache lines for vsmem VsmemHelperT::discard_temp_storage(temp_storage); } - +} // namespace detail::unique_by_key /****************************************************************************** * Dispatch ******************************************************************************/ @@ -333,7 +335,7 @@ struct DispatchUniqueByKey using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< Policy, - AgentUniqueByKey, + detail::unique_by_key::AgentUniqueByKey, KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, @@ -396,9 +398,9 @@ struct DispatchUniqueByKey num_tiles = CUB_MAX(1, num_tiles); int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS); -#ifdef 
CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -439,7 +441,7 @@ struct DispatchUniqueByKey scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log select_if_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG { // Get SM occupancy for unique_by_key_kernel int scan_sm_occupancy; @@ -461,7 +463,7 @@ struct DispatchUniqueByKey items_per_thread, scan_sm_occupancy); } -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke select_if_kernel error = @@ -501,8 +503,8 @@ struct DispatchUniqueByKey { // Ensure kernels are instantiated. return Invoke( - DeviceCompactInitKernel, - DeviceUniqueByKeySweepKernel< + detail::scan::DeviceCompactInitKernel, + detail::unique_by_key::DeviceUniqueByKeySweepKernel< typename PolicyHub::MaxPolicy, KeyInputIteratorT, ValueInputIteratorT, diff --git a/cub/cub/device/dispatch/kernels/reduce.cuh b/cub/cub/device/dispatch/kernels/reduce.cuh index 2064d6f2a09..ca1ed19b529 100644 --- a/cub/cub/device/dispatch/kernels/reduce.cuh +++ b/cub/cub/device/dispatch/kernels/reduce.cuh @@ -103,8 +103,6 @@ finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_ { *d_out = block_aggregate; } -} // namespace reduce -} // namespace detail /** * @brief Reduce region kernel entry point (multi-block). Computes privatized @@ -161,14 +159,14 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) TransformOpT transform_op) { // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; + using AgentReduceT = detail::reduce::AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + AccumT*, + OffsetT, + ReductionOpT, + AccumT, + TransformOpT>; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; @@ -243,14 +241,14 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( TransformOpT transform_op) { // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; + using AgentReduceT = detail::reduce::AgentReduce< + typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT, + AccumT, + TransformOpT>; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; @@ -276,5 +274,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( detail::reduce::finalize_and_store_aggregate(d_out, reduction_op, init, block_aggregate); } } +} // namespace reduce +} // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/scan.cuh b/cub/cub/device/dispatch/kernels/scan.cuh index d38676a84b5..cc3034638bc 100644 --- a/cub/cub/device/dispatch/kernels/scan.cuh +++ b/cub/cub/device/dispatch/kernels/scan.cuh @@ -42,6 +42,11 @@ CUB_NAMESPACE_BEGIN +namespace detail +{ +namespace scan +{ + /****************************************************************************** * Kernel entry points *****************************************************************************/ @@ -169,7 +174,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT; // Thread block type for scanning input tiles - 
using AgentScanT = + using AgentScanT = detail::scan:: AgentScan; // Shared memory for AgentScan @@ -181,4 +186,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile); } +} // namespace scan +} // namespace detail + CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 3932ac74c68..1a06c25cb92 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -133,7 +133,7 @@ struct policy_hub static constexpr int t_scale(int nominalItemsPerThread) { - return ::cuda::std::max(nominalItemsPerThread / NumActiveChannels / v_scale, 1); + return (::cuda::std::max)(nominalItemsPerThread / NumActiveChannels / v_scale, 1); } // SM35 diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 02bfb443fc1..41fbb2c49a4 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -610,7 +610,7 @@ struct sm90_tuning struct policy_hub { - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(KeyT), sizeof(AccumT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(KeyT), sizeof(AccumT))); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(AccumT); template diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index 33771f6882f..87631d1199e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -236,7 +236,7 @@ struct sm90_tuning struct policy_hub { - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(KeyT), sizeof(LengthT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(KeyT), sizeof(LengthT))); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(LengthT); template diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index fc8add23a22..b3eaa4e513c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -714,7 +714,7 @@ template ; - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(key_t), sizeof(AccumT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(key_t), sizeof(AccumT))); static constexpr int combined_input_bytes = static_cast(sizeof(key_t) + sizeof(AccumT)); struct Policy350 : ChainedPolicy<350, Policy350, Policy350> diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh index f2ae69fc091..7e134f7a63f 100644 --- a/cub/cub/grid/grid_barrier.cuh +++ b/cub/cub/grid/grid_barrier.cuh @@ -79,7 +79,7 @@ public: // Threadfence and syncthreads to make sure global writes are visible before // thread-0 reports in with its sync counter __threadfence(); - CTA_SYNC(); + __syncthreads(); if (blockIdx.x == 0) { @@ -89,7 +89,7 @@ public: d_vol_sync[blockIdx.x] = 1; } - CTA_SYNC(); + __syncthreads(); // Wait for everyone else to report in for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) @@ -100,7 +100,7 @@ public: } } - CTA_SYNC(); + 
__syncthreads(); // Let everyone know it's safe to proceed for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) @@ -122,7 +122,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); } } }; diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index a5ce583a1cc..524217c70ea 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -416,7 +416,7 @@ struct CachingDeviceAllocator // Lock mutex.lock(); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog( "Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes_); #endif @@ -527,7 +527,7 @@ struct CachingDeviceAllocator cached_bytes[device].free -= search_key.bytes; cached_bytes[device].live += search_key.bytes; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with " "stream %lld).\n", device, @@ -572,7 +572,7 @@ struct CachingDeviceAllocator if (error == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", device, (long long) search_key.bytes, @@ -611,7 +611,7 @@ struct CachingDeviceAllocator // Reduce balance and erase entry cached_bytes[device].free -= block_itr->bytes; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks " "(%lld bytes) outstanding.\n", device, @@ -656,7 +656,7 @@ struct CachingDeviceAllocator cached_bytes[device].live += search_key.bytes; mutex.unlock(); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", device, search_key.d_ptr, @@ -678,7 +678,7 @@ struct CachingDeviceAllocator // Copy device pointer to output parameter *d_ptr = search_key.d_ptr; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG if (debug) { _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", @@ -761,7 +761,7 @@ struct CachingDeviceAllocator cached_blocks.insert(search_key); cached_bytes[device].free += search_key.bytes; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld " "bytes), %lld live blocks outstanding. 
(%lld bytes)\n", device, @@ -819,7 +819,7 @@ struct CachingDeviceAllocator return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld " "bytes), %lld live blocks (%lld bytes) outstanding.\n", device, @@ -914,7 +914,7 @@ struct CachingDeviceAllocator cached_bytes[current_device].free -= block_bytes; cached_blocks.erase(begin); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", current_device, diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index b1da6a03b5d..a2093ae288b 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -47,6 +47,10 @@ #include #include +#include +#include +#include + // Legacy include; this functionality used to be defined in here. #include @@ -113,27 +117,24 @@ namespace detail static constexpr ::cuda::std::size_t max_smem_per_block = 48 * 1024; } // namespace detail -template +template struct RegBoundScaling { - enum - { - ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, - ((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), - }; + static constexpr int ITEMS_PER_THREAD = + (::cuda::std::max)(1, Nominal4ByteItemsPerThread * 4 / (::cuda::std::max)(4, int{sizeof(T)})); + static constexpr int BLOCK_THREADS = (::cuda::std::min)( + Nominal4ByteBlockThreads, + ::cuda::ceil_div(int{detail::max_smem_per_block} / (int{sizeof(T)} * ITEMS_PER_THREAD), 32) * 32); }; -template +template struct MemBoundScaling { - enum - { - ITEMS_PER_THREAD = - CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, - ((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), - }; + static constexpr int ITEMS_PER_THREAD = (::cuda::std::max)( + 1, (::cuda::std::min)(Nominal4ByteItemsPerThread * 4 / int{sizeof(T)}, Nominal4ByteItemsPerThread * 2)); + static constexpr int BLOCK_THREADS = (::cuda::std::min)( + Nominal4ByteBlockThreads, + ::cuda::ceil_div(int{detail::max_smem_per_block} / (int{sizeof(T)} * ITEMS_PER_THREAD), 32) * 32); }; #endif // Do not document diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index a6eee36539c..d765b6374aa 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -88,7 +88,7 @@ CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); // C++17 dialect check: # ifndef CCCL_IGNORE_DEPRECATED_CPP_DIALECT # if _CCCL_STD_VER < 2017 -CUB_COMP_DEPR_IMPL(CUB requires at least C++ 17. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) +# error CUB requires at least C++17. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message. # endif // _CCCL_STD_VER >= 2017 # endif diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index 275c915e8f2..099408897ad 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -66,22 +66,6 @@ */ # define CUB_DEBUG_SYNC -/** - * @def CUB_DEBUG_HOST_ASSERTIONS - * - * Extends `CUB_DEBUG_SYNC` effects by checking host-side precondition - * assertions. 
- */ -# define CUB_DEBUG_HOST_ASSERTIONS - -/** - * @def CUB_DEBUG_DEVICE_ASSERTIONS - * - * Extends `CUB_DEBUG_HOST_ASSERTIONS` effects by checking device-side - * precondition assertions. - */ -# define CUB_DEBUG_DEVICE_ASSERTIONS - /** * @def CUB_DEBUG_ALL * @@ -94,80 +78,29 @@ #endif // _CCCL_DOXYGEN_INVOKED -// `CUB_DETAIL_DEBUG_LEVEL_*`: Implementation details, internal use only: - -#define CUB_DETAIL_DEBUG_LEVEL_NONE 0 -#define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY 1 -#define CUB_DETAIL_DEBUG_LEVEL_LOG 2 -#define CUB_DETAIL_DEBUG_LEVEL_SYNC 3 -#define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS 4 -#define CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS 5 -#define CUB_DETAIL_DEBUG_LEVEL_ALL 1000 - -// `CUB_DEBUG_*`: User interfaces: - -// Extra logging, no syncs -#ifdef CUB_DEBUG_LOG -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_LOG -#endif - -// Logging + syncs +// CUB_DEBUG_SYNC also enables CUB_DEBUG_LOG #ifdef CUB_DEBUG_SYNC -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_SYNC -#endif - -// Logging + syncs + host assertions -#ifdef CUB_DEBUG_HOST_ASSERTIONS -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS -#endif - -// Logging + syncs + host assertions + device assertions -#ifdef CUB_DEBUG_DEVICE_ASSERTIONS -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS -#endif - -// All -#ifdef CUB_DEBUG_ALL -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_ALL -#endif - -// Default case, no extra debugging: -#ifndef CUB_DETAIL_DEBUG_LEVEL -# ifdef NDEBUG -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_NONE -# else -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY +# ifndef CUB_DEBUG_LOG +# define CUB_DEBUG_LOG # endif #endif -/* - * `CUB_DETAIL_DEBUG_ENABLE_*`: - * Internal implementation details, used for testing enabled debug features: - */ - -#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_LOG -# define CUB_DETAIL_DEBUG_ENABLE_LOG -#endif - -#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_SYNC -# define CUB_DETAIL_DEBUG_ENABLE_SYNC -#endif - -#if (CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS) \ - || (CUB_DETAIL_DEBUG_LEVEL == CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY) -# define CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS -#endif - -#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS -# define CUB_DETAIL_DEBUG_ENABLE_DEVICE_ASSERTIONS -#endif +// CUB_DEBUG_ALL = CUB_DEBUG_LOG + CUB_DEBUG_SYNC +#ifdef CUB_DEBUG_ALL +# ifndef CUB_DEBUG_LOG +# define CUB_DEBUG_LOG +# endif // CUB_DEBUG_LOG +# ifndef CUB_DEBUG_SYNC +# define CUB_DEBUG_SYNC +# endif // CUB_DEBUG_SYNC +#endif // CUB_DEBUG_ALL /// CUB error reporting macro (prints error messages to stderr) #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) # define CUB_STDERR #endif -#if defined(CUB_STDERR) || defined(CUB_DETAIL_DEBUG_ENABLE_LOG) +#if defined(CUB_STDERR) || defined(CUB_DEBUG_LOG) # include #endif diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index b9e4f5c25e6..add033fd030 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -47,7 +47,6 @@ # pragma system_header #endif // no system header -#include // IWYU pragma: export #include #include // for backward compatibility @@ -81,7 +80,6 @@ struct policy_wrapper_t : PolicyT static constexpr int BLOCK_THREADS = BLOCK_THREADS_; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; }; -} // namespace detail /** * \brief Empty kernel for querying PTX manifest metadata (e.g., 
version) for the current device @@ -90,6 +88,8 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void EmptyKernel() {} +} // namespace detail + #endif // _CCCL_DOXYGEN_INVOKED /** @@ -278,7 +278,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) // Instantiate `EmptyKernel` in both host and device code to ensure // it can be called. using EmptyKernelPtr = void (*)(); - EmptyKernelPtr empty_kernel = EmptyKernel; + EmptyKernelPtr empty_kernel = detail::EmptyKernel; // This is necessary for unused variable warnings in host compilers. The // usual syntax of (void)empty_kernel; was not sufficient on MSVC2015. @@ -437,62 +437,28 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = return result; } -/** - * Synchronize the specified \p stream. - */ +//! Synchronize the specified \p stream when called in host code. Otherwise, does nothing. CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) { - cudaError_t result = cudaErrorNotSupported; - - NV_IF_TARGET(NV_IS_HOST, - (result = CubDebug(cudaStreamSynchronize(stream));), - ((void) stream; result = CubDebug(cub::detail::device_synchronize());)); - - return result; + NV_IF_TARGET( + NV_IS_HOST, (return CubDebug(cudaStreamSynchronize(stream));), ((void) stream; return cudaErrorNotSupported;)); } namespace detail { - -/** - * Same as SyncStream, but intended for use with the debug_synchronous flags - * in device algorithms. This should not be used if synchronization is required - * for correctness. - * - * If `debug_synchronous` is false, this function will immediately return - * cudaSuccess. If true, one of the following will occur: - * - * If synchronization is supported by the current compilation target and - * settings, the sync is performed and the sync result is returned. - * - * If syncs are not supported then no sync is performed, but a message is logged - * via _CubLog and cudaSuccess is returned. - */ +//! If CUB_DEBUG_SYNC is defined and this function is called from host code, a sync is performed and the +//! sync result is returned. Otherwise, does nothing. CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream) { -#ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC - +#ifndef CUB_DEBUG_SYNC (void) stream; return cudaSuccess; - -#else // CUB_DETAIL_DEBUG_ENABLE_SYNC: - -# define CUB_TMP_SYNC_AVAILABLE \ - _CubLog("%s\n", "Synchronizing..."); \ - return SyncStream(stream) - -# define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \ - (void) stream; \ - _CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \ - "device-side sync requires -constexpr _CCCL_HOST_DEVICE auto min CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) - -> decltype(t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u)) -{ - return t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u); -} - -template -constexpr _CCCL_HOST_DEVICE auto max CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) - -> decltype(t < u ? ::cuda::std::forward(u) : ::cuda::std::forward(t)) -{ - return t < u ? ::cuda::std::forward(u) : ::cuda::std::forward(t); -} -# undef CUB_PREVENT_MACRO_SUBSTITUTION -#endif - #ifndef CUB_MAX /// Select maximum(a, b) +/// Deprecated since [2.8] # define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) #endif #ifndef CUB_MIN /// Select minimum(a, b) +/// Deprecated since [2.8] # define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) #endif #ifndef CUB_QUOTIENT_FLOOR /// Quotient of x/y rounded down to nearest integer +/// Deprecated since [2.8] # define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) #endif #ifndef CUB_QUOTIENT_CEILING /// Quotient of x/y rounded up to nearest integer +/// Deprecated since [2.8] // FIXME(bgruber): the following computation can overflow, use cuda::ceil_div instead # define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) #endif #ifndef CUB_ROUND_UP_NEAREST /// x rounded up to the nearest multiple of y +/// Deprecated since [2.8] # define CUB_ROUND_UP_NEAREST(x, y) (CUB_QUOTIENT_CEILING(x, y) * y) #endif #ifndef CUB_ROUND_DOWN_NEAREST /// x rounded down to the nearest multiple of y +/// Deprecated since [2.8] # define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) #endif diff --git a/cub/cub/util_math.cuh b/cub/cub/util_math.cuh index 9578c84319b..1a3940f6146 100644 --- a/cub/cub/util_math.cuh +++ b/cub/cub/util_math.cuh @@ -43,6 +43,8 @@ #endif // no system header #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -66,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT safe_add_bound_to_max(OffsetT lhs, O { static_assert(::cuda::std::is_integral::value, "OffsetT must be an integral type"); static_assert(sizeof(OffsetT) >= 4, "OffsetT must be at least 32 bits in size"); - auto const capped_operand_rhs = (cub::min)(rhs, ::cuda::std::numeric_limits::max() - lhs); + auto const capped_operand_rhs = (::cuda::std::min)(rhs, ::cuda::std::numeric_limits::max() - lhs); return lhs + capped_operand_rhs; } @@ -74,14 +76,15 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT safe_add_bound_to_max(OffsetT lhs, O constexpr _CCCL_HOST_DEVICE int Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes) { - return (cub::min)(nominal_4b_items_per_thread, (cub::max)(1, nominal_4b_items_per_thread * 8 / combined_bytes)); + return (::cuda::std::min)(nominal_4b_items_per_thread, + (::cuda::std::max)(1, nominal_4b_items_per_thread * 8 / combined_bytes)); } template constexpr _CCCL_HOST_DEVICE int Nominal4BItemsToItems(int nominal_4b_items_per_thread) { - return (cub::min)(nominal_4b_items_per_thread, - (cub::max)(1, nominal_4b_items_per_thread * 4 / static_cast(sizeof(T)))); + return (::cuda::std::min)(nominal_4b_items_per_thread, + (::cuda::std::max)(1, nominal_4b_items_per_thread * 4 / static_cast(sizeof(T)))); } template @@ -89,10 +92,11 @@ constexpr _CCCL_HOST_DEVICE int Nominal8BItemsToItems(int nominal_8b_items_per_t { return sizeof(ItemT) <= 8u ? nominal_8b_items_per_thread - : (cub::min)(nominal_8b_items_per_thread, - (cub::max)(1, - ((nominal_8b_items_per_thread * 8) + static_cast(sizeof(ItemT)) - 1) - / static_cast(sizeof(ItemT)))); + : (::cuda::std::min)( + nominal_8b_items_per_thread, + (::cuda::std::max)(1, + ((nominal_8b_items_per_thread * 8) + static_cast(sizeof(ItemT)) - 1) + / static_cast(sizeof(ItemT)))); } /** diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index aa522d9576e..99beeed313e 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -52,34 +52,10 @@ CUB_NAMESPACE_BEGIN * Inlined PTX intrinsics ******************************************************************************/ -namespace detail -{ -/** - * @brief Shifts @p val left by the amount specified by unsigned 32-bit value in @p num_bits. If @p - * num_bits is larger than 32 bits, @p num_bits is clamped to 32. 
- */ -_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftLeft(uint32_t val, uint32_t num_bits) -{ - uint32_t ret{}; - asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); - return ret; -} - -/** - * @brief Shifts @p val right by the amount specified by unsigned 32-bit value in @p num_bits. If @p - * num_bits is larger than 32 bits, @p num_bits is clamped to 32. - */ -_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftRight(uint32_t val, uint32_t num_bits) -{ - uint32_t ret{}; - asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); - return ret; -} -} // namespace detail - /** * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; @@ -90,6 +66,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int /** * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; @@ -150,6 +127,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned in /** * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE void BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits) { @@ -159,6 +137,7 @@ BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, u /** * \brief Three-operand add. Returns \p x + \p y + \p z. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) { asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); @@ -192,6 +171,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y * \endcode * */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned int index) { int ret; @@ -204,6 +184,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned /** * Sync-threads barrier. 
*/ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count) { asm volatile("bar.sync 1, %0;" : : "r"(count)); @@ -212,6 +193,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count) /** * CTA barrier */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC() { __syncthreads(); @@ -220,6 +202,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC() /** * CTA barrier with predicate */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads_and() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p) { return __syncthreads_and(p); @@ -228,6 +211,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p) /** * CTA barrier with predicate */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads_or() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p) { return __syncthreads_or(p); @@ -236,6 +220,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p) /** * Warp barrier */ +CCCL_DEPRECATED_BECAUSE("use __syncwarp() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask) { __syncwarp(member_mask); @@ -244,6 +229,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask) /** * Warp any */ +CCCL_DEPRECATED_BECAUSE("use __any_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_mask) { return __any_sync(member_mask, predicate); @@ -252,6 +238,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_m /** * Warp any */ +CCCL_DEPRECATED_BECAUSE("use __all_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_mask) { return __all_sync(member_mask, predicate); @@ -260,6 +247,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_m /** * Warp ballot */ +CCCL_DEPRECATED_BECAUSE("use __ballot_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_BALLOT(int predicate, unsigned int member_mask) { return __ballot_sync(member_mask, predicate); @@ -292,6 +280,7 @@ SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member /** * Warp synchronous shfl_idx */ +CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) { @@ -304,6 +293,7 @@ SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_ma /** * Warp synchronous shfl_idx */ +CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) { return __shfl_sync(member_mask, word, src_lane); @@ -312,6 +302,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int /** * Floating point multiply. (Mantissa LSB rounds towards zero.) */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b) { float d; @@ -322,6 +313,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b) /** * Floating point multiply-add. (Mantissa LSB rounds towards zero.) 
*/ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE float FFMA_RZ(float a, float b, float c) { float d; @@ -342,6 +334,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadExit() /** * \brief Abort execution and generate an interrupt to the host CPU */ +CCCL_DEPRECATED_BECAUSE("use cuda::std::terminate() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadTrap() { asm volatile("trap;"); @@ -359,6 +352,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int RowMajorTid(int block_dim_x, int block_dim_y, /** * \brief Returns the warp lane ID of the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_laneid() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId() { unsigned int ret; @@ -370,6 +364,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId() * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not * correspond to a zero-based ranking within the thread block. */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_warpid() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int WarpId() { unsigned int ret; @@ -409,6 +404,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned int WarpMask(unsigned int warp_id) /** * \brief Returns the warp lane mask of all lanes less than the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_lt() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt() { unsigned int ret; @@ -419,6 +415,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt() /** * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_le() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe() { unsigned int ret; @@ -429,6 +426,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe() /** * \brief Returns the warp lane mask of all lanes greater than the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_gt() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt() { unsigned int ret; @@ -439,6 +437,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt() /** * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_ge() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGe() { unsigned int ret; @@ -659,12 +658,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleDown(T input, int src_offset, int last_t template _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned int member_mask) { - /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - enum - { - SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) - }; - using ShuffleWord = typename UnitWord::ShuffleWord; constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); @@ -674,18 +667,14 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned in ShuffleWord* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; - shuffle_word = SHFL_IDX_SYNC((unsigned int) input_alias[0], src_lane, SHFL_C, member_mask); - + shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[0], src_lane, LOGICAL_WARP_THREADS); output_alias[0] = shuffle_word; - #pragma unroll for (int WORD = 1; WORD < WORDS; ++WORD) { - shuffle_word = SHFL_IDX_SYNC((unsigned int) input_alias[WORD], src_lane, SHFL_C, member_mask); - + 
shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[WORD], src_lane, LOGICAL_WARP_THREADS); output_alias[WORD] = shuffle_word; } - return output; } @@ -750,6 +739,28 @@ struct warp_matcher_t } }; +/** + * @brief Shifts @p val left by the amount specified by unsigned 32-bit value in @p num_bits. If @p + * num_bits is larger than 32 bits, @p num_bits is clamped to 32. + */ +_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftLeft(uint32_t val, uint32_t num_bits) +{ + uint32_t ret{}; + asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); + return ret; +} + +/** + * @brief Shifts @p val right by the amount specified by unsigned 32-bit value in @p num_bits. If @p + * num_bits is larger than 32 bits, @p num_bits is clamped to 32. + */ +_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftRight(uint32_t val, uint32_t num_bits) +{ + uint32_t ret{}; + asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); + return ret; +} + } // namespace detail #endif // _CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh index f5926ce11e5..baba489c0ae 100644 --- a/cub/cub/util_vsmem.cuh +++ b/cub/cub/util_vsmem.cuh @@ -168,7 +168,7 @@ public: static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) { // Ensure all threads finished using temporary storage - CTA_SYNC(); + __syncthreads(); const std::size_t linear_tid = threadIdx.x; const std::size_t block_stride = line_size * blockDim.x; diff --git a/cub/cub/warp/specializations/warp_exchange_shfl.cuh b/cub/cub/warp/specializations/warp_exchange_shfl.cuh index 5abfa7cdd2f..f874f961caa 100644 --- a/cub/cub/warp/specializations/warp_exchange_shfl.cuh +++ b/cub/cub/warp/specializations/warp_exchange_shfl.cuh @@ -40,6 +40,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -273,8 +275,8 @@ public: WarpExchangeShfl() = delete; explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeShfl(TempStorage&) - : lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + : lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} diff --git a/cub/cub/warp/specializations/warp_exchange_smem.cuh b/cub/cub/warp/specializations/warp_exchange_smem.cuh index aabb9e291e9..35b688f813c 100644 --- a/cub/cub/warp/specializations/warp_exchange_smem.cuh +++ b/cub/cub/warp/specializations/warp_exchange_smem.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -88,8 +90,8 @@ public: explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 
0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} @@ -102,7 +104,7 @@ public: const int idx = ITEMS_PER_THREAD * lane_id + item; temp_storage.items_shared[idx] = input_items[item]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { @@ -120,7 +122,7 @@ public: const int idx = LOGICAL_WARP_THREADS * item + lane_id; temp_storage.items_shared[idx] = input_items[item]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { @@ -147,13 +149,13 @@ public: { if (INSERT_PADDING) { - ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); + ranks[ITEM] = (ranks[ITEM] >> LOG_SMEM_BANKS) + ranks[ITEM]; } temp_storage.items_shared[ranks[ITEM]] = input_items[ITEM]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -162,7 +164,7 @@ public: if (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[ITEM] = temp_storage.items_shared[item_offset]; diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index 3e0db152123..8c4ad78d1ad 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -48,6 +48,7 @@ #include #include +#include #include #include @@ -82,8 +83,6 @@ template struct reduce_max_exists : ::cuda::std::true_type {}; -} // namespace detail - /** * @brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned * across a CUDA thread warp. @@ -155,7 +154,7 @@ struct WarpReduceShfl /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceShfl(TempStorage& /*temp_storage*/) - : lane_id(static_cast(LaneId())) + : lane_id(static_cast(::cuda::ptx::get_sreg_laneid())) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { @@ -699,7 +698,7 @@ struct WarpReduceShfl _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { // Get the start flags for each thread in the warp. - int warp_flags = WARP_BALLOT(flag, member_mask); + int warp_flags = __ballot_sync(member_mask, flag); // Convert to tail-segmented if (HEAD_SEGMENTED) @@ -708,7 +707,7 @@ struct WarpReduceShfl } // Mask out the bits below the current thread - warp_flags &= LaneMaskGe(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_ge(); // Mask of physical lanes outside the logical warp and convert to logical lanemask if (!IS_ARCH_WARP) @@ -738,5 +737,11 @@ struct WarpReduceShfl return output; } }; +} // namespace detail + +template +using WarpReduceShfl CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::WarpReduceShfl; CUB_NAMESPACE_END diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 87b38db2aa3..ade195ee6cb 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -49,8 +49,11 @@ #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN +namespace detail +{ /** * @brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned * across a CUDA thread warp. 
@@ -123,8 +126,8 @@ struct WarpReduceSmem /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) - , member_mask(WarpMask(LaneId() / LOGICAL_WARP_THREADS)) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) + , member_mask(WarpMask(::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** @@ -159,7 +162,7 @@ struct WarpReduceSmem // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update input if peer_addend is in range if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) @@ -168,7 +171,7 @@ struct WarpReduceSmem input = reduction_op(input, peer_addend); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); return ReduceStep(input, valid_items, reduction_op, Int2Type()); } @@ -222,7 +225,7 @@ struct WarpReduceSmem SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type /*has_ballot*/) { // Get the start flags for each thread in the warp. - int warp_flags = WARP_BALLOT(flag, member_mask); + int warp_flags = __ballot_sync(member_mask, flag); if (!HEAD_SEGMENTED) { @@ -230,12 +233,12 @@ struct WarpReduceSmem } // Keep bits above the current thread. - warp_flags &= LaneMaskGt(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_gt(); // Accommodate packing of multiple logical warps in a single physical warp if (!IS_ARCH_WARP) { - warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + warp_flags >>= (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; } // Find next flag @@ -255,7 +258,7 @@ struct WarpReduceSmem // Share input into buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update input if peer_addend is in range if (OFFSET + lane_id < next_flag) @@ -264,7 +267,7 @@ struct WarpReduceSmem input = reduction_op(input, peer_addend); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); } return input; @@ -311,12 +314,12 @@ struct WarpReduceSmem // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Get peer from buffer T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Share flag through buffer flag_storage[lane_id] = flag_status; @@ -409,5 +412,10 @@ struct WarpReduceSmem return SegmentedReduce(input, flag, reduction_op, Int2Type()); } }; +} // namespace detail +template +using WarpReduceSmem CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::WarpReduceSmem; CUB_NAMESPACE_END diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index c3952b96b4f..402b476c4e4 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -48,8 +48,11 @@ #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN +namespace detail +{ /** * @brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned * across a CUDA thread warp. 
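Editor's note on the substitutions above: the mechanical replacements applied throughout these warp specializations follow a single mapping from the now-deprecated cub helpers to CUDA built-ins and cuda::ptx special-register accessors. Below is a small device-code sketch of that mapping; the helper function and variable names are hypothetical stand-ins for the surrounding agent code, not part of this diff.

// Deprecated helper            ->  replacement used in this diff
// CTA_SYNC()                   ->  __syncthreads()
// WARP_SYNC(member_mask)       ->  __syncwarp(member_mask)
// WARP_BALLOT(pred, mask)      ->  __ballot_sync(mask, pred)   (note: argument order is swapped)
// LaneId()                     ->  ::cuda::ptx::get_sreg_laneid()
// LaneMaskGt() / LaneMaskGe()  ->  ::cuda::ptx::get_sreg_lanemask_gt() / _ge()
#include <cuda/ptx>

// Hypothetical helper mirroring the rewritten SegmentedReduce flag handling above.
__device__ unsigned int flags_above_current_lane(int flag, unsigned int member_mask)
{
  unsigned int warp_flags = __ballot_sync(member_mask, flag); // was WARP_BALLOT(flag, member_mask)
  return warp_flags & ::cuda::ptx::get_sreg_lanemask_gt();    // was LaneMaskGt()
}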
@@ -116,7 +119,7 @@ struct WarpScanShfl /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpScanShfl(TempStorage& /*temp_storage*/) - : lane_id(LaneId()) + : lane_id(::cuda::ptx::get_sreg_laneid()) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { @@ -511,7 +514,7 @@ struct WarpScanShfl // Iterate scan steps int segment_first_lane = 0; -// Iterate scan steps + // Iterate scan steps #pragma unroll for (int STEP = 0; STEP < STEPS; STEP++) { @@ -540,15 +543,15 @@ struct WarpScanShfl KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); - unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); + unsigned int ballot = __ballot_sync(member_mask, (pred_key != inclusive_output.key)); // Mask away all lanes greater than ours - ballot = ballot & LaneMaskLe(); + ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le(); // Find index of first set bit int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); -// Iterate scan steps + // Iterate scan steps #pragma unroll for (int STEP = 0; STEP < STEPS; STEP++) { @@ -672,5 +675,11 @@ struct WarpScanShfl Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); } }; +} // namespace detail + +template +using WarpScanShfl CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::WarpScanShfl; CUB_NAMESPACE_END diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh index 90bdfbf361a..090f0f96cb5 100644 --- a/cub/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/cub/warp/specializations/warp_scan_smem.cuh @@ -49,8 +49,11 @@ #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN +namespace detail +{ /** * @brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned * across a CUDA thread warp. @@ -113,10 +116,10 @@ struct WarpScanSmem : temp_storage(temp_storage.Alias()) , - lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) + lane_id(IS_ARCH_WARP ? 
::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) , - member_mask(WarpMask(LaneId() / LOGICAL_WARP_THREADS)) + member_mask(WarpMask(::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** @@ -132,7 +135,7 @@ struct WarpScanSmem // Share partial into buffer ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update partial if addend is in range if (HAS_IDENTITY || (lane_id >= OFFSET)) @@ -140,7 +143,7 @@ struct WarpScanSmem T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); partial = scan_op(addend, partial); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); ScanStep(partial, scan_op, Int2Type()); } @@ -171,7 +174,7 @@ struct WarpScanSmem T identity = 0; ThreadStore(&temp_storage[lane_id], (CellT) identity); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Iterate scan steps output = input; @@ -226,7 +229,7 @@ struct WarpScanSmem ThreadStore(temp_storage, (CellT) input); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); return (T) ThreadLoad(temp_storage); } @@ -276,11 +279,11 @@ struct WarpScanSmem // Retrieve aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); } //--------------------------------------------------------------------- @@ -307,7 +310,7 @@ struct WarpScanSmem // initial value unknown ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); } @@ -334,7 +337,7 @@ struct WarpScanSmem inclusive = scan_op(initial_value, inclusive); ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); if (lane_id == 0) @@ -364,7 +367,7 @@ struct WarpScanSmem // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); @@ -385,7 +388,7 @@ struct WarpScanSmem // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); exclusive = inclusive - input; @@ -408,11 +411,11 @@ struct WarpScanSmem // Broadcast warp aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update inclusive with initial value inclusive = scan_op(initial_value, inclusive); @@ -420,7 +423,7 @@ struct WarpScanSmem // Get exclusive from exclusive ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); - WARP_SYNC(member_mask); + 
__syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); @@ -430,5 +433,11 @@ struct WarpScanSmem } } }; +} // namespace detail + +template +using WarpScanSmem CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::WarpScanSmem; CUB_NAMESPACE_END diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh index ac5c700b958..3f11129c35a 100644 --- a/cub/cub/warp/warp_load.cuh +++ b/cub/cub/warp/warp_load.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -438,14 +440,16 @@ public: //! shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad() : temp_storage(PrivateStorage()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @} end member group diff --git a/cub/cub/warp/warp_merge_sort.cuh b/cub/cub/warp/warp_merge_sort.cuh index 40e29322c1f..de3d311ae59 100644 --- a/cub/cub/warp/warp_merge_sort.cuh +++ b/cub/cub/warp/warp_merge_sort.cuh @@ -41,6 +41,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -151,8 +152,10 @@ public: WarpMergeSort() = delete; _CCCL_DEVICE _CCCL_FORCEINLINE WarpMergeSort(typename BlockMergeSortStrategyT::TempStorage& temp_storage) - : BlockMergeSortStrategyT(temp_storage, IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + : BlockMergeSortStrategyT( + temp_storage, + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} @@ -164,7 +167,7 @@ public: private: _CCCL_DEVICE _CCCL_FORCEINLINE void SyncImplementation() const { - WARP_SYNC(member_mask); + __syncwarp(member_mask); } friend BlockMergeSortStrategyT; diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 00440c18bdf..4b2c61e343a 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -174,8 +174,8 @@ public: /// Internal specialization. /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two - using InternalWarpReduce = - ::cuda::std::_If, WarpReduceSmem>; + using InternalWarpReduce = ::cuda::std:: + _If, detail::WarpReduceSmem>; #endif // _CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh index 0e0668709b0..6eb6a35562b 100644 --- a/cub/cub/warp/warp_scan.cuh +++ b/cub/cub/warp/warp_scan.cuh @@ -49,6 +49,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -179,8 +180,8 @@ private: /// Internal specialization. 
/// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two - using InternalWarpScan = - ::cuda::std::_If, WarpScanSmem>; + using InternalWarpScan = ::cuda::std:: + _If, detail::WarpScanSmem>; /// Shared memory storage layout type for WarpScan using _TempStorage = typename InternalWarpScan::TempStorage; @@ -212,7 +213,7 @@ public: //! Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE WarpScan(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) {} //! @} end member group diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh index bb99bc5965e..f0a9929e24f 100644 --- a/cub/cub/warp/warp_store.cuh +++ b/cub/cub/warp/warp_store.cuh @@ -45,6 +45,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -378,14 +380,16 @@ public: //! memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore() : temp_storage(PrivateStorage()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @} end member group diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index c86d24754de..5a093526edd 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -260,7 +260,7 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) ) cub_clone_target_properties(${test_target} ${cub_target}) target_include_directories(${test_target} PRIVATE "${CUB_SOURCE_DIR}/test") - target_compile_definitions(${test_target} PRIVATE CUB_DETAIL_DEBUG_ENABLE_SYNC) + target_compile_definitions(${test_target} PRIVATE CUB_DEBUG_SYNC) if ("${test_target}" MATCHES "nvtx_in_usercode") target_link_libraries(${test_target} PRIVATE nvtx3-cpp) diff --git a/cub/test/catch2_large_array_sort_helper.cuh b/cub/test/catch2_large_array_sort_helper.cuh index 6c0ed2a48be..4f80a2cd595 100644 --- a/cub/test/catch2_large_array_sort_helper.cuh +++ b/cub/test/catch2_large_array_sort_helper.cuh @@ -67,7 +67,7 @@ template class key_sort_ref_key_transform { static constexpr double max_key = static_cast(::cuda::std::numeric_limits::max()); - const double m_conversion; + double m_conversion; std::size_t m_num_items; bool m_is_descending; @@ -140,7 +140,7 @@ public: _CCCL_HOST_DEVICE KeyType operator()(std::size_t idx) const { // The final summary may be padded, so truncate the summary_idx at the last valid idx: - const std::size_t summary_idx = thrust::min(m_num_summaries - 1, idx / m_unpadded_run_size); + const std::size_t summary_idx = cuda::std::min(m_num_summaries - 1, idx / m_unpadded_run_size); const KeyType key = m_is_descending ? 
static_cast((m_num_summaries - 1 - summary_idx) * m_key_conversion) : static_cast(summary_idx * m_key_conversion); diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh index fe188a17bf1..642b2aed4f1 100644 --- a/cub/test/catch2_radix_sort_helper.cuh +++ b/cub/test/catch2_radix_sort_helper.cuh @@ -41,6 +41,7 @@ #include #include +#include #include #include @@ -54,7 +55,7 @@ // Index types used for OffsetsT testing using offset_types = c2h::type_list; using all_offset_types = - c2h::type_list; + c2h::type_list; // Create a segment iterator that returns the next multiple of Step except for a few cases. This allows to save memory template @@ -62,35 +63,13 @@ struct segment_iterator { OffsetT last = 0; - segment_iterator(OffsetT last1) + segment_iterator(std::int64_t last1) : last{last1} {} - __host__ __device__ OffsetT operator()(OffsetT x) const + __host__ __device__ OffsetT operator()(std::int64_t x) const { - switch (x) - { - case Step * 100: - return Step * 100 + Step / 2; - case Step * 200: - return Step * 200 + Step / 2; - case Step * 300: - return Step * 300 + Step / 2; - case Step * 400: - return Step * 400 + Step / 2; - case Step * 500: - return Step * 500 + Step / 2; - case Step * 600: - return Step * 600 + Step / 2; - case Step * 700: - return Step * 700 + Step / 2; - case Step * 800: - return Step * 800 + Step / 2; - case Step * 900: - return Step * 900 + Step / 2; - default: - return (x >= last) ? last : x * Step; - } + return (::cuda::std::min)(last, x * Step); } }; diff --git a/cub/test/catch2_segmented_sort_helper.cuh b/cub/test/catch2_segmented_sort_helper.cuh index 0852921bebf..f8a081a125a 100644 --- a/cub/test/catch2_segmented_sort_helper.cuh +++ b/cub/test/catch2_segmented_sort_helper.cuh @@ -26,7 +26,6 @@ ******************************************************************************/ #pragma once -// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT #include #include @@ -38,6 +37,7 @@ #include #include #include +#include #include #include @@ -46,11 +46,11 @@ #include +#include "catch2_test_launch_helper.h" #include #include #include #include -#include #include #define MAKE_SEED_MOD_FUNCTION(name, xor_mask) \ @@ -71,6 +71,194 @@ MAKE_SEED_MOD_FUNCTION(offset_eraser, 0x3333333333333333) #undef MAKE_SEED_MOD_FUNCTION +// Helper to generate a certain number of empty segments followed by equi-sized segments. 
+template +struct segment_index_to_offset_op +{ + SegmentIndexT num_empty_segments; + SegmentIndexT num_segments; + OffsetT segment_size; + OffsetT num_items; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT operator()(SegmentIndexT i) + { + if (i < num_empty_segments) + { + return 0; + } + else if (i < num_segments) + { + return segment_size * static_cast(i - num_empty_segments); + } + else + { + return num_items; + } + } +}; + +template +struct mod_n +{ + std::size_t mod; + + template + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T operator()(IndexT x) + { + return static_cast(x % mod); + } +}; + +template +struct short_key_verification_helper +{ + using key_t = KeyT; + // The histogram size of the keys being sorted for later verification + const std::int64_t max_histo_size = std::int64_t{1} << ::cuda::std::numeric_limits::digits; + + // Holding the histogram of the keys being sorted for verification + c2h::host_vector keys_histogram{}; + +public: + void prepare_verification_data(const c2h::device_vector& in_keys) + { + c2h::host_vector h_in{in_keys}; + keys_histogram = c2h::host_vector(max_histo_size, 0); + for (const auto& key : h_in) + { + keys_histogram[key]++; + } + } + + void verify_sorted(const c2h::device_vector& out_keys) const + { + // Verify keys are sorted next to each other + auto count = thrust::unique_count(c2h::device_policy, out_keys.cbegin(), out_keys.cend(), thrust::equal_to()); + REQUIRE(count <= max_histo_size); + + // Verify keys are sorted using prior histogram computation + auto index_it = thrust::make_counting_iterator(std::size_t{0}); + c2h::device_vector unique_keys_out(count); + c2h::device_vector unique_indexes_out(count); + thrust::unique_by_key_copy( + c2h::device_policy, + out_keys.cbegin(), + out_keys.cend(), + index_it, + unique_keys_out.begin(), + unique_indexes_out.begin()); + + for (int i = 0; i < count; i++) + { + auto const next_end = (i == count - 1) ? out_keys.size() : unique_indexes_out[i + 1]; + REQUIRE(keys_histogram[unique_keys_out[i]] == next_end - unique_indexes_out[i]); + } + } +}; + +template +class segmented_verification_helper +{ +private: + using key_t = KeyT; + const std::size_t sequence_length{}; + + // Analytically computes the histogram for a segment of a series of keys: [0, 1, 2, ..., mod_n - 1, 0, 1, 2, ...]. + // `segment_end` is one-past-the-end of the segment to compute the histogram for. + c2h::host_vector compute_histogram_of_series(std::size_t segment_offset, std::size_t segment_end) const + { + // The i-th full cycle begins after segment_offset + const auto start_cycle = cuda::ceil_div(segment_offset, sequence_length); + + // The last full cycle ending before segment_end + const auto end_cycle = segment_end / sequence_length; + + // Number of full cycles repeating the sequence + const int full_cycles = (end_cycle > start_cycle) ? 
static_cast(end_cycle - start_cycle) : 0; + + // Add contributions from full cycles + c2h::host_vector histogram(sequence_length, full_cycles); + + // Partial cycles preceding the first full cycle + for (std::size_t j = segment_offset; j < start_cycle * sequence_length; ++j) + { + const auto value = j % sequence_length; + histogram[value]++; + } + + // Partial cycles following the last full cycle + for (std::size_t j = end_cycle * sequence_length; j < segment_end; ++j) + { + const auto value = j % sequence_length; + histogram[value]++; + } + return histogram; + } + +public: + segmented_verification_helper(int sequence_length) + : sequence_length(sequence_length) + {} + + void prepare_input_data(c2h::device_vector& in_keys) const + { + auto data_gen_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), mod_n{sequence_length}); + thrust::copy_n(data_gen_it, in_keys.size(), in_keys.begin()); + } + + template + void verify_sorted(c2h::device_vector& out_keys, SegmentOffsetItT offsets, std::size_t num_segments) const + { + // The segments' end-offsets are provided by the segments' begin-offset iterator + auto offsets_plus_1 = offsets + 1; + + // Verify keys are sorted next to each other + const auto count = static_cast( + thrust::unique_count(c2h::device_policy, out_keys.cbegin(), out_keys.cend(), thrust::equal_to())); + REQUIRE(count <= sequence_length * num_segments); + + // // Verify keys are sorted using prior histogram computation + auto index_it = thrust::make_counting_iterator(std::size_t{0}); + c2h::device_vector unique_keys_out(count); + c2h::device_vector unique_indexes_out(count); + thrust::unique_by_key_copy( + c2h::device_policy, + out_keys.cbegin(), + out_keys.cend(), + index_it, + unique_keys_out.begin(), + unique_indexes_out.begin()); + + // Copy the unique keys and indexes to host memory + c2h::host_vector h_unique_keys_out{unique_keys_out}; + c2h::host_vector h_unique_indexes_out{unique_indexes_out}; + + // Verify keys are sorted using prior histogram computation + std::size_t uniques_index = 0; + std::size_t current_offset = 0; + for (std::size_t seg_index = 0; seg_index < num_segments; ++seg_index) + { + const auto segment_offset = offsets[seg_index]; + const auto segment_end = offsets_plus_1[seg_index]; + const auto segment_histogram = compute_histogram_of_series(segment_offset, segment_end); + for (std::size_t i = 0; i < sequence_length; i++) + { + if (segment_histogram[i] != 0) + { + CAPTURE(seg_index, i, uniques_index, current_offset, count); + auto const next_end = + (uniques_index == count - 1) ? 
out_keys.size() : h_unique_indexes_out[uniques_index + 1]; + REQUIRE(h_unique_keys_out[uniques_index] == i); + REQUIRE(next_end - h_unique_indexes_out[uniques_index] == static_cast(segment_histogram[i])); + current_offset += segment_histogram[i]; + uniques_index++; + } + } + } + } +}; + template struct unwrap_value_t_impl { diff --git a/cub/test/catch2_test_block_run_length_decode.cu b/cub/test/catch2_test_block_run_length_decode.cu index cf080e173d7..dc322e49f8a 100644 --- a/cub/test/catch2_test_block_run_length_decode.cu +++ b/cub/test/catch2_test_block_run_length_decode.cu @@ -104,7 +104,7 @@ private: BlockRunOffsetScanT(temp_storage.run_offsets_scan_storage).ExclusiveSum(run_lengths, run_offsets, decoded_size); // Ensure temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); // Construct BlockRunLengthDecode and initialize with the run offsets return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage, unique_items, run_offsets); @@ -137,7 +137,7 @@ private: } // Ensure BlockLoad's temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); // Load this block's tile of run lengths if (num_valid_items < RUNS_PER_BLOCK) @@ -151,7 +151,7 @@ private: } // Ensure temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); } public: diff --git a/cub/test/catch2_test_debug.cu b/cub/test/catch2_test_debug.cu index 3293ca6b7d7..a158ff9afd1 100644 --- a/cub/test/catch2_test_debug.cu +++ b/cub/test/catch2_test_debug.cu @@ -11,7 +11,7 @@ TEST_CASE("CubDebug returns input error", "[debug][utils]") TEST_CASE("CubDebug returns new errors", "[debug][utils]") { - cub::EmptyKernel<<<0, 0>>>(); + cub::detail::EmptyKernel<<<0, 0>>>(); cudaError error = cudaPeekAtLastError(); REQUIRE(error != cudaSuccess); @@ -20,7 +20,7 @@ TEST_CASE("CubDebug returns new errors", "[debug][utils]") TEST_CASE("CubDebug prefers input errors", "[debug][utils]") { - cub::EmptyKernel<<<0, 0>>>(); + cub::detail::EmptyKernel<<<0, 0>>>(); cudaError error = cudaPeekAtLastError(); REQUIRE(error != cudaSuccess); @@ -29,7 +29,7 @@ TEST_CASE("CubDebug prefers input errors", "[debug][utils]") TEST_CASE("CubDebug resets last error", "[debug][utils]") { - cub::EmptyKernel<<<0, 0>>>(); + cub::detail::EmptyKernel<<<0, 0>>>(); cudaError error = cudaPeekAtLastError(); REQUIRE(error != cudaSuccess); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu b/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu index 561290ce075..b58109cb657 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu @@ -34,7 +34,6 @@ #include -#include "catch2/catch.hpp" #include "catch2_test_launch_helper.h" #include diff --git a/cub/test/catch2_test_device_segmented_sort_keys.cu b/cub/test/catch2_test_device_segmented_sort_keys.cu index 823665ee0ef..3d392e8e8f6 100644 --- a/cub/test/catch2_test_device_segmented_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_sort_keys.cu @@ -24,62 +24,20 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ + #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first +#include + #include "catch2_radix_sort_helper.cuh" +#include "catch2_segmented_sort_helper.cuh" #include -#include // FIXME: Graph launch disabled, algorithm syncs internally. 
WAR exists for device-launch, figure out how to enable for // graph launch. - -// TODO replace with DeviceSegmentedSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceSegmentedSort algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_sort_wrapper( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - NumItemsT num_items, - NumItemsT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - bool* selector, - bool is_overwrite = false, - cudaStream_t stream = 0) -{ - cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - cub::DoubleBuffer d_values; - auto status = - cub::DispatchSegmentedSort:: - Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - is_overwrite, - stream); - if (status != cudaSuccess) - { - return status; - } - if (is_overwrite) - { - // Only write to selector in the DoubleBuffer invocation - *selector = d_keys.Current() != d_keys_out; - } - return cudaSuccess; -} - // %PARAM% TEST_LAUNCH lid 0:1 -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_wrapper, dispatch_segmented_sort_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_wrapper, dispatch_segmented_sort); +DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedSort::StableSortKeys, stable_sort_keys); using key_types = c2h::type_list(C2H_SEED(4)); } -#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) - -// we can reuse the same structure of DeviceSegmentedRadixSortKeys for simplicity -C2H_TEST("DeviceSegmentedSortKeys: very large num. items and num. segments", - "[keys][segmented][sort][device]", - all_offset_types) +C2H_TEST("DeviceSegmentedSortKeys: very large number of segments", "[keys][segmented][sort][device]", all_offset_types) try { - using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs - using offset_t = c2h::get<0, TestType>; - constexpr std::size_t Step = 500; - using segment_iterator_t = segment_iterator; - constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); - constexpr int num_key_seeds = 1; - const bool is_descending = GENERATE(false, true); - const bool is_overwrite = GENERATE(false, true); + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using segment_offset_t = std::int64_t; + using offset_t = c2h::get<0, TestType>; + using segment_iterator_t = segment_index_to_offset_op; + constexpr std::size_t segment_size = 1000000; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); - const std::size_t num_segments = ::cuda::ceil_div(num_items, Step); - CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + constexpr segment_offset_t num_empty_segments = uint32_max; + const segment_offset_t num_segments = num_empty_segments + ::cuda::ceil_div(num_items, segment_size); + CAPTURE(c2h::type_name(), num_items, num_segments); c2h::device_vector in_keys(num_items); c2h::device_vector out_keys(num_items); - c2h::gen(C2H_SEED(num_key_seeds), in_keys); - auto offsets = - thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); - auto offsets_plus_1 = offsets + 1; - // Allocate host/device-accessible memory to communicate the selected output buffer - bool* selector_ptr = nullptr; - if (is_overwrite) - { - REQUIRE(cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)) == cudaSuccess); - } - - auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, num_segments, offsets, offsets_plus_1); - auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); - if (is_descending) - { - dispatch_segmented_sort_descending( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - static_cast(num_items), - static_cast(num_segments), - offsets, - offsets_plus_1, - selector_ptr, - is_overwrite); - } - else - { - dispatch_segmented_sort( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - static_cast(num_items), - static_cast(num_segments), - offsets, - offsets_plus_1, - selector_ptr, - is_overwrite); - } - if (is_overwrite) - { - if (*selector_ptr) - { - std::swap(out_keys, in_keys); - } - REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); - } - REQUIRE((ref_keys == out_keys) == true); + + // Generate input keys + constexpr auto max_histo_size = 250; + segmented_verification_helper verification_helper{max_histo_size}; + verification_helper.prepare_input_data(in_keys); + + auto offsets = thrust::make_transform_iterator( + thrust::make_counting_iterator(std::size_t{0}), + segment_iterator_t{num_empty_segments, num_segments, segment_size, num_items}); + + stable_sort_keys( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets + 1); + + // Verify the keys are sorted correctly + verification_helper.verify_sorted(out_keys, offsets + num_empty_segments, num_segments - num_empty_segments); } catch (std::bad_alloc& e) { @@ -299,15 +225,14 @@ C2H_TEST("DeviceSegmentedSort::SortKeys: very large segments", "[keys][segmented try { using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using segment_offset_t = std::int32_t; using offset_t = c2h::get<0, TestType>; constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); constexpr int num_key_seeds = 1; - const bool is_descending = GENERATE(false, true); - const bool is_overwrite = GENERATE(false, true); constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); - const std::size_t num_segments = 2; - CAPTURE(c2h::type_name(), num_items, is_descending, is_overwrite); + const segment_offset_t num_segments = 2; + CAPTURE(c2h::type_name(), num_items, num_segments); c2h::device_vector in_keys(num_items); c2h::device_vector out_keys(num_items); @@ -317,51 +242,22 @@ try offsets[1] = static_cast(num_items); offsets[2] = static_cast(num_items); - // Allocate host/device-accessible memory to communicate the selected output buffer - bool* selector_ptr = nullptr; - if (is_overwrite) - { - REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); - } - auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, offsets); - auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); - if (is_descending) - { - dispatch_segmented_sort_descending( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - static_cast(num_items), - static_cast(num_segments), - thrust::raw_pointer_cast(offsets.data()), - offsets.cbegin() + 1, - selector_ptr, - is_overwrite); - } - else - { - dispatch_segmented_sort( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - static_cast(num_items), - static_cast(num_segments), - thrust::raw_pointer_cast(offsets.data()), - offsets.cbegin() + 1, - selector_ptr, - is_overwrite); - } - if (is_overwrite) - { - if (*selector_ptr) - { - std::swap(out_keys, in_keys); - } - REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr)); - } - REQUIRE((ref_keys == out_keys) == true); + // Prepare information for later verification + short_key_verification_helper verification_helper{}; + verification_helper.prepare_verification_data(in_keys); + + stable_sort_keys( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1); + + // Verify the keys are sorted correctly + verification_helper.verify_sorted(out_keys); } catch (std::bad_alloc& e) { std::cerr << "Skipping segmented sort test, insufficient GPU memory. " << e.what() << "\n"; } - -#endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) diff --git a/cub/test/catch2_test_device_segmented_sort_pairs.cu b/cub/test/catch2_test_device_segmented_sort_pairs.cu index a3034608076..f24d30dbed1 100644 --- a/cub/test/catch2_test_device_segmented_sort_pairs.cu +++ b/cub/test/catch2_test_device_segmented_sort_pairs.cu @@ -27,66 +27,14 @@ #include "catch2_radix_sort_helper.cuh" // above header needs to be included first +#include "catch2_segmented_sort_helper.cuh" #include -#include // FIXME: Graph launch disabled, algorithm syncs internally. WAR exists for device-launch, figure out how to enable for // graph launch. 
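The rewritten very-large-problem tests (keys above, pairs below) no longer materialize a full reference sort. They generate keys with a known periodic pattern, describe segments through the segment_index_to_offset_op helper, and check the output against analytically computed histograms. A small host-side sketch of that offset mapping, using tiny illustrative parameters rather than the test's actual sizes:

#include <cstdint>
#include <cstdio>

// Mirrors the semantics of the segment_index_to_offset_op helper added above:
// a run of empty segments, then equi-sized segments, clamped to num_items.
std::int64_t segment_offset(std::int64_t i,
                            std::int64_t num_empty_segments,
                            std::int64_t num_segments,
                            std::int64_t segment_size,
                            std::int64_t num_items)
{
  if (i < num_empty_segments)
  {
    return 0;
  }
  if (i < num_segments)
  {
    return segment_size * (i - num_empty_segments);
  }
  return num_items;
}

int main()
{
  const std::int64_t num_empty = 3, segment_size = 4, num_items = 10;
  const std::int64_t num_segments = num_empty + (num_items + segment_size - 1) / segment_size;
  for (std::int64_t i = 0; i <= num_segments; ++i)
  {
    std::printf("offset[%lld] = %lld\n",
                static_cast<long long>(i),
                static_cast<long long>(segment_offset(i, num_empty, num_segments, segment_size, num_items)));
  }
  return 0;
}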
- -// TODO replace with DeviceSegmentedSort::SortPairs interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceSegmentedSort algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_sort_pairs_wrapper( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - NumItemsT num_items, - NumItemsT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - bool* selector, - bool is_overwrite = false, - cudaStream_t stream = 0) -{ - cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - cub::DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - - auto status = cub:: - DispatchSegmentedSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - is_overwrite, - stream); - if (status != cudaSuccess) - { - return status; - } - if (is_overwrite) - { - // Only write to selector in the DoubleBuffer invocation - *selector = d_keys.Current() != d_keys_out; - } - return cudaSuccess; -} - // %PARAM% TEST_LAUNCH lid 0:1 -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_pairs_wrapper, dispatch_segmented_sort_pairs_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_pairs_wrapper, dispatch_segmented_sort_pairs); +DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedSort::StableSortPairs, stable_sort_pairs); using pair_types = c2h::type_list, @@ -251,90 +199,56 @@ C2H_TEST("DeviceSegmentedSortPairs: Unspecified segments, random key/values", test_unspecified_segments_random(C2H_SEED(4)); } -#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) - -// we can reuse the same structure of DeviceSegmentedRadixSortPairs for simplicity C2H_TEST("DeviceSegmentedSortPairs: very large num. items and num. segments", "[pairs][segmented][sort][device]", all_offset_types) try { - using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs - using value_t = cuda::std::uint8_t; - using offset_t = c2h::get<0, TestType>; - constexpr std::size_t Step = 500; - using segment_iterator_t = segment_iterator; - constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); - constexpr int num_key_seeds = 1; - constexpr int num_value_seeds = 1; - const bool is_descending = GENERATE(false, true); - const bool is_overwrite = GENERATE(false, true); + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using segment_offset_t = std::int64_t; + using offset_t = c2h::get<0, TestType>; + using segment_iterator_t = segment_index_to_offset_op; + constexpr std::size_t segment_size = 1000000; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); - const std::size_t num_segments = ::cuda::ceil_div(num_items, Step); - CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + constexpr segment_offset_t num_empty_segments = uint32_max; + const segment_offset_t num_segments = num_empty_segments + ::cuda::ceil_div(num_items, segment_size); + CAPTURE(c2h::type_name(), num_items, num_segments); + // Generate input c2h::device_vector in_keys(num_items); c2h::device_vector in_values(num_items); - c2h::gen(C2H_SEED(num_key_seeds), in_keys); - c2h::gen(C2H_SEED(num_value_seeds), in_values); + constexpr auto max_histo_size = 250; + segmented_verification_helper verification_helper{max_histo_size}; + verification_helper.prepare_input_data(in_keys); + thrust::copy(in_keys.cbegin(), in_keys.cend(), in_values.begin()); // Initialize the output vectors by copying the inputs since not all items may belong to a segment. c2h::device_vector out_keys(num_items); c2h::device_vector out_values(num_items); - auto offsets = - thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); + + auto offsets = thrust::make_transform_iterator( + thrust::make_counting_iterator(std::size_t{0}), + segment_iterator_t{num_empty_segments, num_segments, segment_size, num_items}); auto offsets_plus_1 = offsets + 1; - bool* selector_ptr = nullptr; - if (is_overwrite) - { - REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); - } - - auto refs = segmented_radix_sort_reference(in_keys, in_values, is_descending, num_segments, offsets, offsets_plus_1); - auto& ref_keys = refs.first; - auto& ref_values = refs.second; - auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); - auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); - if (is_descending) - { - dispatch_segmented_sort_pairs_descending( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - offsets, - offsets_plus_1, - selector_ptr, - is_overwrite); - } - else - { - dispatch_segmented_sort_pairs( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - offsets, - offsets_plus_1, - selector_ptr, - is_overwrite); - } - if (is_overwrite) - { - if (*selector_ptr) - { - std::swap(out_keys, in_keys); - std::swap(out_values, in_values); - } - REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); - } - REQUIRE(ref_keys == out_keys); - REQUIRE(ref_values == out_values); + + stable_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + thrust::raw_pointer_cast(in_values.data()), + thrust::raw_pointer_cast(out_values.data()), + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1); + + // Verify the keys are sorted correctly + verification_helper.verify_sorted(out_keys, offsets + num_empty_segments, num_segments - num_empty_segments); + + // Verify values were sorted along with the keys + REQUIRE(thrust::equal(out_keys.cbegin(), out_keys.cend(), out_values.cbegin())); } catch (std::bad_alloc& e) { @@ -346,82 +260,47 @@ try { using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs using value_t = cuda::std::uint8_t; + using segment_offset_t = std::int32_t; using offset_t = c2h::get<0, 
TestType>; constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); constexpr int num_key_seeds = 1; - constexpr int num_value_seeds = 1; - const bool is_descending = GENERATE(false, true); - const bool is_overwrite = GENERATE(false, true); constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); - constexpr std::size_t num_segments = 2; - CAPTURE(c2h::type_name(), num_items, is_descending, is_overwrite); + constexpr segment_offset_t num_segments = 2; + CAPTURE(c2h::type_name(), num_items, num_segments); c2h::device_vector in_keys(num_items); c2h::device_vector in_values(num_items); c2h::device_vector out_keys(num_items); c2h::gen(C2H_SEED(num_key_seeds), in_keys); - c2h::gen(C2H_SEED(num_value_seeds), in_values); + thrust::copy(in_keys.cbegin(), in_keys.cend(), in_values.begin()); c2h::device_vector out_values(num_items); c2h::device_vector offsets(num_segments + 1); - offsets[0] = 0; - offsets[1] = static_cast(num_items); - offsets[2] = static_cast(num_items); - bool* selector_ptr = nullptr; - if (is_overwrite) - { - REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); - } - - auto refs = segmented_radix_sort_reference( - in_keys, in_values, is_descending, num_segments, offsets.cbegin(), offsets.cbegin() + 1); - auto& ref_keys = refs.first; - auto& ref_values = refs.second; - auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); - auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); - if (is_descending) - { - dispatch_segmented_sort_pairs_descending( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - thrust::raw_pointer_cast(offsets.data()), - offsets.cbegin() + 1, - selector_ptr, - is_overwrite); - } - else - { - dispatch_segmented_sort_pairs( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - thrust::raw_pointer_cast(offsets.data()), - offsets.cbegin() + 1, - selector_ptr, - is_overwrite); - } - if (is_overwrite) - { - if (*selector_ptr) - { - std::swap(out_keys, in_keys); - std::swap(out_values, in_values); - } - REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); - } - REQUIRE(ref_keys == out_keys); - REQUIRE(ref_values == out_values); + offsets[0] = 0; + offsets[1] = static_cast(num_items); + offsets[2] = static_cast(num_items); + + // Prepare information for later verification + short_key_verification_helper verification_helper{}; + verification_helper.prepare_verification_data(in_keys); + + stable_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + thrust::raw_pointer_cast(in_values.data()), + thrust::raw_pointer_cast(out_values.data()), + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1); + + // Verify the keys are sorted correctly + verification_helper.verify_sorted(out_keys); + + // Verify values were sorted along with the keys + REQUIRE(thrust::equal(out_keys.cbegin(), out_keys.cend(), out_values.cbegin())); } catch (std::bad_alloc& e) { std::cerr << "Skipping segmented sort test, insufficient GPU memory. 
" << e.what() << "\n"; } - -#endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) diff --git a/cub/test/catch2_test_vsmem.cu b/cub/test/catch2_test_vsmem.cu index cf86389d68c..6b16bde7fa9 100644 --- a/cub/test/catch2_test_vsmem.cu +++ b/cub/test/catch2_test_vsmem.cu @@ -33,7 +33,6 @@ #include #include -#include "catch2/catch.hpp" #include "catch2_test_launch_helper.h" #include diff --git a/cub/test/catch2_test_warp_merge_sort.cu b/cub/test/catch2_test_warp_merge_sort.cu index 7b245ebba33..fa4f986ad64 100644 --- a/cub/test/catch2_test_warp_merge_sort.cu +++ b/cub/test/catch2_test_warp_merge_sort.cu @@ -88,7 +88,7 @@ __global__ void warp_merge_sort_kernel(T* in, T* out, SegmentSizeItT segment_siz const int idx = thread_offset + item; thread_data[item] = in[idx]; } - cub::WARP_SYNC(warp_sort.get_member_mask()); + __syncwarp(warp_sort.get_member_mask()); // Run merge sort test action(warp_sort, thread_data, valid_items, oob_default); @@ -153,7 +153,7 @@ __global__ void warp_merge_sort_kernel( keys[item] = keys_in[idx]; values[item] = values_in[idx]; } - cub::WARP_SYNC(warp_sort.get_member_mask()); + __syncwarp(warp_sort.get_member_mask()); // Run merge sort test action(warp_sort, keys, values, valid_items, oob_default); diff --git a/cub/test/insert_nested_NVTX_range_guard.h b/cub/test/insert_nested_NVTX_range_guard.h index 56d7aad6bc1..9da6cf042b1 100644 --- a/cub/test/insert_nested_NVTX_range_guard.h +++ b/cub/test/insert_nested_NVTX_range_guard.h @@ -5,7 +5,7 @@ #include #include -#include +#include #if defined(__cpp_inline_variables) inline thread_local bool entered = false; diff --git a/cub/test/test_allocator.cu b/cub/test/test_allocator.cu index 4b4723fe997..9628e936a13 100644 --- a/cub/test/test_allocator.cu +++ b/cub/test/test_allocator.cu @@ -105,7 +105,7 @@ int main(int argc, char** argv) CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream0_a, 999, 0)); // Run some big kernel in stream 0 - EmptyKernel<<<32000, 512, 1024 * 8, 0>>>(); + detail::EmptyKernel<<<32000, 512, 1024 * 8, 0>>>(); // Free d_999B_stream0_a CubDebugExit(allocator.DeviceFree(d_999B_stream0_a)); @@ -120,7 +120,7 @@ int main(int argc, char** argv) AssertEquals(allocator.cached_blocks.size(), 0); // Run some big kernel in stream 0 - EmptyKernel<<<32000, 512, 1024 * 8, 0>>>(); + detail::EmptyKernel<<<32000, 512, 1024 * 8, 0>>>(); // Free d_999B_stream0_b CubDebugExit(allocator.DeviceFree(d_999B_stream0_b)); @@ -138,7 +138,7 @@ int main(int argc, char** argv) AssertEquals(allocator.cached_blocks.size(), 1); // Run some big kernel in other_stream - EmptyKernel<<<32000, 512, 1024 * 8, other_stream>>>(); + detail::EmptyKernel<<<32000, 512, 1024 * 8, other_stream>>>(); // Free d_999B_stream_other CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a)); @@ -170,7 +170,7 @@ int main(int argc, char** argv) AssertEquals(allocator.cached_blocks.size(), 0); // Run some big kernel in other_stream - EmptyKernel<<<32000, 512, 1024 * 8, other_stream>>>(); + detail::EmptyKernel<<<32000, 512, 1024 * 8, other_stream>>>(); // Free d_999B_stream_other_a and d_999B_stream_other_b CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a)); @@ -388,7 +388,7 @@ int main(int argc, char** argv) // Prime the caching allocator and the kernel CubDebugExit(allocator.DeviceAllocate((void**) &d_1024MB, timing_bytes)); CubDebugExit(allocator.DeviceFree(d_1024MB)); - cub::EmptyKernel<<<1, 32>>>(); + detail::EmptyKernel<<<1, 32>>>(); // CUDA cpu_timer.Start(); @@ -427,7 +427,7 @@ int main(int argc, char** argv) gpu_timer.Start(); for (int 
i = 0; i < timing_iterations; ++i) { - cub::EmptyKernel<<<1, 32>>>(); + detail::EmptyKernel<<<1, 32>>>(); } gpu_timer.Stop(); float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis(); @@ -437,7 +437,7 @@ int main(int argc, char** argv) for (int i = 0; i < timing_iterations; ++i) { CubDebugExit(cudaMalloc((void**) &d_1024MB, timing_bytes)); - cub::EmptyKernel<<<1, 32>>>(); + detail::EmptyKernel<<<1, 32>>>(); CubDebugExit(cudaFree(d_1024MB)); } gpu_timer.Stop(); @@ -448,7 +448,7 @@ int main(int argc, char** argv) for (int i = 0; i < timing_iterations; ++i) { CubDebugExit(allocator.DeviceAllocate((void**) &d_1024MB, timing_bytes)); - cub::EmptyKernel<<<1, 32>>>(); + detail::EmptyKernel<<<1, 32>>>(); CubDebugExit(allocator.DeviceFree(d_1024MB)); } gpu_timer.Stop(); diff --git a/cub/test/test_device_batch_memcpy.cu b/cub/test/test_device_batch_memcpy.cu index 2d550e32fa0..7ddb22cffc0 100644 --- a/cub/test/test_device_batch_memcpy.cu +++ b/cub/test/test_device_batch_memcpy.cu @@ -348,7 +348,7 @@ void RunTest(BufferOffsetT num_buffers, template __global__ void TestVectorizedCopyKernel(const void* d_in, void* d_out, ByteOffsetT copy_size) { - cub::detail::VectorizedCopy(threadIdx.x, d_out, copy_size, d_in); + cub::detail::batch_memcpy::VectorizedCopy(threadIdx.x, d_out, copy_size, d_in); } struct TupleMemberEqualityOp @@ -409,7 +409,7 @@ template __global__ void TestBitPackedCounterKernel(uint32_t* bins, uint32_t* increments, uint32_t* counts_out, uint32_t num_items) { - using BitPackedCounterT = cub::detail::BitPackedCounter; + using BitPackedCounterT = cub::detail::batch_memcpy::BitPackedCounter; BitPackedCounterT counter{}; for (uint32_t i = 0; i < num_items; i++) { diff --git a/cub/test/test_device_spmv.cu b/cub/test/test_device_spmv.cu index 5a120e56e96..13dba77a594 100644 --- a/cub/test/test_device_spmv.cu +++ b/cub/test/test_device_spmv.cu @@ -47,6 +47,8 @@ #include #include +_CCCL_SUPPRESS_DEPRECATED_PUSH + bool g_verbose = false; //============================================================================== @@ -605,3 +607,5 @@ int main(int argc, char** argv) test_types(); } + +_CCCL_SUPPRESS_DEPRECATED_POP diff --git a/cub/test/test_grid_barrier.cu b/cub/test/test_grid_barrier.cu index e763b48d1e2..c3f6bb5eea1 100644 --- a/cub/test/test_grid_barrier.cu +++ b/cub/test/test_grid_barrier.cu @@ -109,7 +109,7 @@ int main(int argc, char** argv) int sm_count, max_block_threads, max_sm_occupancy; CubDebugExit(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); CubDebugExit(cudaDeviceGetAttribute(&max_block_threads, cudaDevAttrMaxThreadsPerBlock, device_ordinal)); - CubDebugExit(MaxSmOccupancy(max_sm_occupancy, EmptyKernel, 32)); + CubDebugExit(MaxSmOccupancy(max_sm_occupancy, detail::EmptyKernel, 32)); // Compute grid size and occupancy int occupancy = CUB_MIN((max_block_threads / block_size), max_sm_occupancy); diff --git a/cub/test/test_util.h b/cub/test/test_util.h index e61cd7cd6e2..c06d803ecb1 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -614,7 +614,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T& value, s case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + cuda::std::terminate(); break; case UNIFORM: value = 2; @@ -656,7 +656,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool& value case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate 
random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + cuda::std::terminate(); break; case UNIFORM: value = true; @@ -697,7 +697,7 @@ InitValue(GenMode gen_mode, CUB_NS_QUALIFIER::KeyValuePair& value, ), ( // NV_IS_DEVICE _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + cuda::std::terminate(); )); // clang-format on } diff --git a/cub/test/thread_reduce/catch2_test_thread_reduce.cu b/cub/test/thread_reduce/catch2_test_thread_reduce.cu index fe88ea003be..ba7342db9a5 100644 --- a/cub/test/thread_reduce/catch2_test_thread_reduce.cu +++ b/cub/test/thread_reduce/catch2_test_thread_reduce.cu @@ -48,6 +48,7 @@ #include "c2h/custom_type.h" #include "c2h/extended_types.h" #include "c2h/generators.h" +#include /*********************************************************************************************************************** * Thread Reduce Wrapper Kernels @@ -285,7 +286,7 @@ _CCCL_TEMPLATE(typename T) _CCCL_REQUIRES((::cuda::std::is_floating_point::value)) void verify_results(const T& expected_data, const T& test_results) { - REQUIRE(expected_data == Approx(test_results).epsilon(0.05)); + REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05})); } _CCCL_TEMPLATE(typename T) diff --git a/cudax/include/cuda/experimental/__hierarchy/level_dimensions.cuh b/cudax/include/cuda/experimental/__hierarchy/level_dimensions.cuh index f46c581ceb9..5c1fd9cab59 100644 --- a/cudax/include/cuda/experimental/__hierarchy/level_dimensions.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/level_dimensions.cuh @@ -118,7 +118,7 @@ struct level_dimensions using level_type = Level; // Needs alignas to work around an issue with tuple - alignas(16) const Dimensions dims; // Unit for dimensions is implicit + alignas(16) Dimensions dims; // Unit for dimensions is implicit _CCCL_HOST_DEVICE constexpr level_dimensions(const Dimensions& d) : dims(d) diff --git a/cudax/include/cuda/experimental/__stf/utility/memory.cuh b/cudax/include/cuda/experimental/__stf/utility/memory.cuh index bd3c895ee16..b0f987ca63b 100644 --- a/cudax/include/cuda/experimental/__stf/utility/memory.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/memory.cuh @@ -29,6 +29,7 @@ #include +#include #include namespace cuda::experimental::stf diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 3c4abf38c91..c6fb198f35b 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -8,9 +8,6 @@ find_package(Thrust ${cudax_VERSION} EXACT CONFIG ) thrust_create_target(cudax.test.thrust) -add_library(catch2_main STATIC catch2_helpers/catch2_main.cpp) -target_link_libraries(catch2_main PUBLIC Catch2::Catch2) - ## cudax_add_test # # Add a catch2 test executable and register it with ctest. 
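These test-infrastructure hunks track the migration to Catch2 v3: the hand-rolled catch2_main translation unit above is dropped in favor of the library-provided Catch2::Catch2WithMain target, and the thread_reduce test earlier in this patch moves from the deprecated Approx helper to the WithinRel matcher. A tiny sketch of the matcher usage; the 0.05 tolerance mirrors the test above, while the test name and values are illustrative:

#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>

TEST_CASE("WithinRel checks a relative tolerance", "[sketch]")
{
  const float expected = 1.00f;
  const float computed = 1.04f; // differs from expected by less than 5 %
  REQUIRE_THAT(expected, Catch::Matchers::WithinRel(computed, 0.05f));
}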
@@ -34,8 +31,7 @@ function(cudax_add_catch2_test target_name_var test_name cn_target) # ARGN=test target_link_libraries(${test_target} PRIVATE ${cn_target} cudax.test.thrust - Catch2::Catch2 - catch2_main + Catch2::Catch2WithMain ) target_compile_options(${test_target} PRIVATE "-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE" diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh index 661d087f3bc..c3c0f40869e 100644 --- a/cudax/test/algorithm/common.cuh +++ b/cudax/test/algorithm/common.cuh @@ -17,7 +17,7 @@ #include #include -#include +#include #include inline constexpr uint8_t fill_byte = 1; diff --git a/cudax/test/catch2_helpers/catch2_main.cpp b/cudax/test/catch2_helpers/catch2_main.cpp deleted file mode 100644 index 23afde17338..00000000000 --- a/cudax/test/catch2_helpers/catch2_main.cpp +++ /dev/null @@ -1,2 +0,0 @@ -#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do this in one cpp file -#include diff --git a/cudax/test/common/testing.cuh b/cudax/test/common/testing.cuh index 965ae398e7b..65dc438c307 100644 --- a/cudax/test/common/testing.cuh +++ b/cudax/test/common/testing.cuh @@ -11,15 +11,39 @@ #ifndef __COMMON_TESTING_H__ #define __COMMON_TESTING_H__ +#include + #include #include // IWYU pragma: keep #include #include -#include +#include +#include #include +// workaround for error #3185-D: no '#pragma diagnostic push' was found to match this 'diagnostic pop' +#if _CCCL_COMPILER(NVHPC) +# undef CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma("diag push") +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma("diag pop") +#endif +// workaround for error +// * MSVC14.39: #3185-D: no '#pragma diagnostic push' was found to match this 'diagnostic pop' +// * MSVC14.29: internal error: assertion failed: alloc_copy_of_pending_pragma: copied pragma has source sequence entry +// (pragma.c, line 526 in alloc_copy_of_pending_pragma) +// see also upstream Catch2 issue: https://github.com/catchorg/Catch2/issues/2636 +#if _CCCL_COMPILER(MSVC) +# undef CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS +#endif + namespace cuda::experimental::__async { } diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu index 4cd09badfa6..392f5fb2944 100644 --- a/cudax/test/containers/uninitialized_async_buffer.cu +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -23,7 +23,6 @@ #include #include "testing.cuh" -#include struct do_not_construct { diff --git a/cudax/test/event/event_smoke.cu b/cudax/test/event/event_smoke.cu index 58cbb638daa..45d791652a0 100644 --- a/cudax/test/event/event_smoke.cu +++ b/cudax/test/event/event_smoke.cu @@ -11,7 +11,7 @@ #include #include -#include +#include #include namespace diff --git a/cudax/test/execution/env.cu b/cudax/test/execution/env.cu index 55663ad78be..ec0985d3759 100644 --- a/cudax/test/execution/env.cu +++ b/cudax/test/execution/env.cu @@ -14,7 +14,7 @@ #include #include -#include +#include namespace cudax = cuda::experimental; using env_t = cudax::env_t; diff --git a/cudax/test/execution/policies/get_execution_policy.cu b/cudax/test/execution/policies/get_execution_policy.cu 
index 11c4937f410..1315a58a2a0 100644 --- a/cudax/test/execution/policies/get_execution_policy.cu +++ b/cudax/test/execution/policies/get_execution_policy.cu @@ -12,7 +12,7 @@ #include -#include +#include using cuda::experimental::execution::execution_policy; diff --git a/cudax/test/execution/policies/policies.cu b/cudax/test/execution/policies/policies.cu index 781397e2ee9..c8073cdb45f 100644 --- a/cudax/test/execution/policies/policies.cu +++ b/cudax/test/execution/policies/policies.cu @@ -12,7 +12,7 @@ #include -#include +#include namespace cudax = cuda::experimental; diff --git a/cudax/test/green_context/green_ctx_smoke.cu b/cudax/test/green_context/green_ctx_smoke.cu index 01b2571e55f..b353cb2e3c1 100644 --- a/cudax/test/green_context/green_ctx_smoke.cu +++ b/cudax/test/green_context/green_ctx_smoke.cu @@ -11,7 +11,7 @@ #include #include -#include +#include #include #if CUDART_VERSION >= 12050 @@ -23,7 +23,7 @@ TEST_CASE("Green context", "[green_context]") } else { - INFO("Can create a green context") + INFO("Can create a green context"); { { [[maybe_unused]] cudax::green_context ctx(cudax::devices[0]); @@ -35,7 +35,7 @@ TEST_CASE("Green context", "[green_context]") } } - INFO("Can create streams under green context") + INFO("Can create streams under green context"); { cudax::green_context green_ctx_dev0(cudax::devices[0]); cudax::stream stream_under_green_ctx(green_ctx_dev0); @@ -47,7 +47,7 @@ TEST_CASE("Green context", "[green_context]") CUDAX_REQUIRE(stream_dev1.device() == 1); } - INFO("Can create a side stream") + INFO("Can create a side stream"); { auto ldev1 = stream_under_green_ctx.logical_device(); CUDAX_REQUIRE(ldev1.get_kind() == cudax::logical_device::kinds::green_context); @@ -60,4 +60,10 @@ TEST_CASE("Green context", "[green_context]") } } } +#else +// For some reason CI fails with empty test, add a dummy test case +TEST_CASE("Dummy test case") +{ + CUDAX_REQUIRE(1 == 1); +} #endif // CUDART_VERSION >= 12050 diff --git a/cudax/test/hierarchy/hierarchy_smoke.cu b/cudax/test/hierarchy/hierarchy_smoke.cu index 62be4f5aac6..cf359aa3318 100644 --- a/cudax/test/hierarchy/hierarchy_smoke.cu +++ b/cudax/test/hierarchy/hierarchy_smoke.cu @@ -10,6 +10,7 @@ #include +#include "testing.cuh" #include #include @@ -380,11 +381,11 @@ TEST_CASE("On device rank calculation", "[hierarchy]") CUDART(cudaMalloc((void**) &ptr, 2 * 1024 * sizeof(unsigned int))); const auto config_static = cudax::block_dims<256>() & cudax::grid_dims(dim3(2, 2, 2)); - rank_kernel<<<256, dim3(2, 2, 2)>>>(config_static, ptr); + rank_kernel<<>>(config_static, ptr); CUDART(cudaDeviceSynchronize()); - rank_kernel_cg<<<256, dim3(2, 2, 2)>>>(config_static, ptr); + rank_kernel_cg<<>>(config_static, ptr); CUDART(cudaDeviceSynchronize()); - rank_kernel_optimized<<<256, dim3(2, 2, 2)>>>(config_static, ptr); + rank_kernel_optimized<<>>(config_static, ptr); CUDART(cudaDeviceSynchronize()); CUDART(cudaFree(ptr)); } diff --git a/cudax/test/memory_resource/any_async_resource.cu b/cudax/test/memory_resource/any_async_resource.cu index c491c9efa21..9dbb898fc08 100644 --- a/cudax/test/memory_resource/any_async_resource.cu +++ b/cudax/test/memory_resource/any_async_resource.cu @@ -11,7 +11,6 @@ #include #include "test_resource.cuh" -#include #include #ifndef __CUDA_ARCH__ diff --git a/cudax/test/memory_resource/device_memory_pool.cu b/cudax/test/memory_resource/device_memory_pool.cu index 351c3d8a0ed..bbfae3385d8 100644 --- a/cudax/test/memory_resource/device_memory_pool.cu +++ b/cudax/test/memory_resource/device_memory_pool.cu @@ 
-18,7 +18,6 @@ #include -#include #include namespace cudax = cuda::experimental; diff --git a/cudax/test/memory_resource/device_memory_resource.cu b/cudax/test/memory_resource/device_memory_resource.cu index aefbb8b1bf7..44402c430e1 100644 --- a/cudax/test/memory_resource/device_memory_resource.cu +++ b/cudax/test/memory_resource/device_memory_resource.cu @@ -16,7 +16,7 @@ #include -#include +#include #include namespace cudax = cuda::experimental; diff --git a/cudax/test/memory_resource/get_memory_resource.cu b/cudax/test/memory_resource/get_memory_resource.cu index c61967fa7a5..389bb955624 100644 --- a/cudax/test/memory_resource/get_memory_resource.cu +++ b/cudax/test/memory_resource/get_memory_resource.cu @@ -13,7 +13,6 @@ #include #include "test_resource.cuh" -#include #include using device_resource = cuda::experimental::device_memory_resource; diff --git a/cudax/test/memory_resource/managed_memory_resource.cu b/cudax/test/memory_resource/managed_memory_resource.cu index 1c5836192ba..c0a4f66dc62 100644 --- a/cudax/test/memory_resource/managed_memory_resource.cu +++ b/cudax/test/memory_resource/managed_memory_resource.cu @@ -17,7 +17,7 @@ #include -#include +#include #include namespace cudax = cuda::experimental; diff --git a/cudax/test/memory_resource/pinned_memory_resource.cu b/cudax/test/memory_resource/pinned_memory_resource.cu index 4240491c6a3..bcbe3a315ec 100644 --- a/cudax/test/memory_resource/pinned_memory_resource.cu +++ b/cudax/test/memory_resource/pinned_memory_resource.cu @@ -18,7 +18,7 @@ #include #include "cuda/__memory_resource/resource_ref.h" -#include +#include #include namespace cudax = cuda::experimental; diff --git a/cudax/test/memory_resource/shared_resource.cu b/cudax/test/memory_resource/shared_resource.cu index cd279ab0b9e..29e6122774e 100644 --- a/cudax/test/memory_resource/shared_resource.cu +++ b/cudax/test/memory_resource/shared_resource.cu @@ -12,7 +12,6 @@ #include #include "test_resource.cuh" -#include #include TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource]", big_resource, small_resource) diff --git a/cudax/test/memory_resource/test_resource.cuh b/cudax/test/memory_resource/test_resource.cuh index 75cd9b665b2..644cace7abc 100644 --- a/cudax/test/memory_resource/test_resource.cuh +++ b/cudax/test/memory_resource/test_resource.cuh @@ -8,7 +8,6 @@ #include #include -#include #include using std::size_t; diff --git a/cudax/test/stream/get_stream.cu b/cudax/test/stream/get_stream.cu index 43cb9921990..5e0f6417ac9 100644 --- a/cudax/test/stream/get_stream.cu +++ b/cudax/test/stream/get_stream.cu @@ -10,7 +10,7 @@ #include -#include +#include #include TEST_CASE("Can call get_stream on a cudaStream_t", "[stream]") diff --git a/cudax/test/stream/stream_smoke.cu b/cudax/test/stream/stream_smoke.cu index fd50ab7adf7..62cc8dcad45 100644 --- a/cudax/test/stream/stream_smoke.cu +++ b/cudax/test/stream/stream_smoke.cu @@ -11,7 +11,7 @@ #include #include -#include +#include #include TEST_CASE("Can create a stream and launch work into it", "[stream]") @@ -122,7 +122,7 @@ TEST_CASE("Stream get device", "[stream]") auto stream_ref_cudart = cudax::stream_ref(stream_handle); CUDAX_REQUIRE(stream_ref_cudart.device() == *std::prev(cudax::devices.end())); - INFO("Can create a side stream using logical device") + INFO("Can create a side stream using logical device"); { if (test::cuda_driver_version() >= 12050) { diff --git a/docs/cccl_development/macro.rst b/docs/cccl_development/macro.rst index 6bf1b0b67ab..30de6aa8b10 100644 --- 
a/docs/cccl_development/macro.rst +++ b/docs/cccl_development/macro.rst @@ -264,13 +264,15 @@ Usage example: **Portable Builtin Macros**: -+-----------------------------+--------------------------------------------+ -| ``_CCCL_UNREACHABLE()`` | Portable ``__builtin_unreachable()`` | -+-----------------------------+--------------------------------------------+ -| ``_CCCL_BUILTIN_ASSUME(X)`` | Portable ``__builtin_assume(X)`` | -+-----------------------------+--------------------------------------------+ -| ``_CCCL_BUILTIN_EXPECT(X)`` | Portable ``__builtin_expected(X)`` | -+-----------------------------+--------------------------------------------+ ++---------------------------------------+--------------------------------------------+ +| ``_CCCL_UNREACHABLE()`` | Portable ``__builtin_unreachable()`` | ++---------------------------------------+--------------------------------------------+ +| ``_CCCL_BUILTIN_ASSUME(X)`` | Portable ``__builtin_assume(X)`` | ++---------------------------------------+--------------------------------------------+ +| ``_CCCL_BUILTIN_EXPECT(X)`` | Portable ``__builtin_expected(X)`` | ++---------------------------------------+--------------------------------------------+ +| ``_CCCL_BUILTIN_PREFETCH(X[, Y, Z])`` | Portable ``__builtin_prefetch(X, Y, Z)`` | ++---------------------------------------+--------------------------------------------+ **Portable Keyword Macros** diff --git a/docs/cub/developer_overview.rst b/docs/cub/developer_overview.rst index a0a78ed0d71..4cc639e27fb 100644 --- a/docs/cub/developer_overview.rst +++ b/docs/cub/developer_overview.rst @@ -239,8 +239,8 @@ For example, :cpp:struct:`cub::WarpReduce` dispatches to two different implement using InternalWarpReduce = cuda::std::conditional_t< IS_POW_OF_TWO, - WarpReduceShfl, // shuffle-based implementation - WarpReduceSmem>; // smem-based implementation + detail::WarpReduceShfl, // shuffle-based implementation + detail::WarpReduceSmem>; // smem-based implementation Specializations provide different shared memory requirements, so the actual ``_TempStorage`` type is defined as: diff --git a/docs/cuda_parallel/index.rst b/docs/cuda_parallel/index.rst index e494fb1e323..c54feb81f85 100644 --- a/docs/cuda_parallel/index.rst +++ b/docs/cuda_parallel/index.rst @@ -22,3 +22,9 @@ Iterators :members: :undoc-members: :imported-members: + +Utilities +--------- + +.. automodule:: cuda.parallel.experimental.struct + :members: diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst index 5e9af18aae2..59c6068a09c 100644 --- a/docs/libcudacxx/extended_api/math.rst +++ b/docs/libcudacxx/extended_api/math.rst @@ -1,52 +1,28 @@ .. _libcudacxx-extended-api-math: Math -===== +==== -.. code:: cuda +.. toctree:: + :hidden: + :maxdepth: 1 - template - [[nodiscard]] __host__ __device__ constexpr T ceil_div(T a, T b) noexcept; + cuda::ceil_div + cuda::round_up + cuda::round_down -ceil_div ---------- +.. list-table:: + :widths: 25 45 30 + :header-rows: 0 -- _Requires_: `is_integral_v` is true. -- _Preconditions_: `a >= 0` is true and `b > 0` is true. -- _Returns_: divides `a` by `b`. If `a` is not a multiple of `b` rounds the result up to the next integer value. + * - :ref:`ceil_div ` + - Ceiling division + - CCCL 2.6.0 / CUDA 12.6 -.. 
note:: + * - :ref:`round_up ` + - Round to the next multiple + - CCCL 2.9.0 / CUDA 12.9 - The function is only constexpr from C++14 onwards - -**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: - -.. code:: cuda - - #include - #include - - __global__ void vscale(int n, float s, float *x) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) x[i] *= s; - } - - int main() { - const int n = 100000; - const float s = 2.f; - std::vector x(n, 1.f); - - // Given a fixed number of threads per block... - constexpr int threads_per_block = 256; - - // ...dividing some "n" by "threads_per_block" may lead to a remainder, - // requiring the kernel to be launched with an extra thread block to handle it. - const int thread_blocks = cuda::ceil_div(n, threads_per_block); - - vscale<<>>(n, s, x.data()); - cudaDeviceSynchronize(); - - return 0; - } - -`See it on Godbolt TODO` + * - :ref:`round_down ` + - Round to the previous multiple + - CCCL 2.9.0 / CUDA 12.9 diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst new file mode 100644 index 00000000000..df6d8c973fa --- /dev/null +++ b/docs/libcudacxx/extended_api/math/ceil_div.rst @@ -0,0 +1,52 @@ +.. _libcudacxx-extended-api-math-ceil-div: + +``ceil_div`` Ceiling Division +============================= + +.. code:: cuda + + template + [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept; + +``value``: The value to be divided. +``divisor``: The divisor. + +- *Requires*: ``is_integral_v`` is true and ``is_integral_v`` is true. +- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. +- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value. + +.. note:: + + The function is only constexpr from C++14 onwards + +**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: + +.. code:: cuda + + #include + #include + + __global__ void vscale(int n, float s, float *x) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) x[i] *= s; + } + + int main() { + const int n = 100000; + const float s = 2.f; + std::vector x(n, 1.f); + + // Given a fixed number of threads per block... + constexpr int threads_per_block = 256; + + // ...dividing some "n" by "threads_per_block" may lead to a remainder, + // requiring the kernel to be launched with an extra thread block to handle it. + const int thread_blocks = cuda::ceil_div(n, threads_per_block); + + vscale<<>>(n, s, x.data()); + cudaDeviceSynchronize(); + + return 0; + } + +`See it on Godbolt TODO` diff --git a/docs/libcudacxx/extended_api/math/round_down.rst b/docs/libcudacxx/extended_api/math/round_down.rst new file mode 100644 index 00000000000..20a80998fd3 --- /dev/null +++ b/docs/libcudacxx/extended_api/math/round_down.rst @@ -0,0 +1,38 @@ +.. _libcudacxx-extended-api-math-round-down: + +``round_down`` Round to the previous multiple +============================================= + +.. code:: cuda + + template + [[nodiscard]] __host__ __device__ inline + constexpr cuda::std::common_type_t round_down(T value, U base_multiple) noexcept; + +``value``: The value to be rounded down. +``base_multiple``: The base multiple to which the value rounds down. 
+ +- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators. +- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. +- *Returns*: ``a`` rounded down to the largest multiple of ``b`` less than or equal to ``a``. If ``a`` is already a multiple of ``b``, returns ``a``. + +.. note:: + + The function is only available from C++17 onwards. + +**Performance considerations**: + +- The function performs a truncation division followed by a multiplication. It provides better performance than ``a / b * b`` when the common type is a signed integer. + +**Example**: + +.. code:: cuda + + #include + + __global__ void example_kernel(int a, unsigned b, unsigned* result) { + // a = 7, b = 3 -> result = 6 + *result = cuda::round_down(a, b); + } + +`See it on Godbolt TODO` diff --git a/docs/libcudacxx/extended_api/math/round_up.rst b/docs/libcudacxx/extended_api/math/round_up.rst new file mode 100644 index 00000000000..13c282aaad7 --- /dev/null +++ b/docs/libcudacxx/extended_api/math/round_up.rst @@ -0,0 +1,40 @@ +.. _libcudacxx-extended-api-math-round-up: + +``round_up`` Round to the next multiple +======================================= + +.. code:: cuda + + template + [[nodiscard]] __host__ __device__ inline + constexpr cuda::std::common_type_t round_up(T value, U base_multiple) noexcept; + +``value``: The value to be rounded up. +``base_multiple``: The base multiple to which the value rounds up. + +- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators. +- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. +- *Returns*: ``a`` rounded up to the smallest multiple of ``b`` greater than or equal to ``a``. If ``a`` is already a multiple of ``b``, returns ``a``. +- *Note*: The result can overflow if ``ceil(a / b) * b`` exceeds the maximum value of the common type of + ``a`` and ``b``. The condition is checked in debug mode. + +.. note:: + + The function is only available from C++17 onwards. + +**Performance considerations**: + +- The function performs a ceiling division (``cuda::ceil_div()``) followed by a multiplication. + +**Example**: + +.. code:: cuda + + #include + + __global__ void example_kernel(int a, unsigned b, unsigned* result) { + // a = 7, b = 3 -> result = 9 + *result = cuda::round_up(a, b); + } + +`See it on Godbolt TODO` diff --git a/docs/libcudacxx/standard_api.rst b/docs/libcudacxx/standard_api.rst index be806240615..dda4e56b9d9 100644 --- a/docs/libcudacxx/standard_api.rst +++ b/docs/libcudacxx/standard_api.rst @@ -14,6 +14,7 @@ Standard API standard_api/ranges_library standard_api/synchronization_library standard_api/time_library + standard_api/type_support standard_api/utility_library Standard Library Backports @@ -112,7 +113,14 @@ Feature availability: - C++26 ``std::dims`` is available in C++14. -- C++23 ``forward_like``, ``to_underlying`` and ``unreachable`` from ```` are available in C++11. +- C++26 ``std::linalg`` accessors, transposed layout, and related functions are available in C++17. + + - ``scaled()`` and ``scaled_accessor`` + - ``conjugated()`` and ``conjugated_accessor`` + - ``transposed()`` and ``layout_transpose`` + - ``conjugate_transposed()`` + +- C++23 ``forward_like``, ``to_underlying``, and ``unreachable`` from ```` are available in C++11. - C++23 ``is_scoped_enum`` in ```` is available in C++11.
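The standard_api.rst hunk above only lists the backported names. A minimal usage sketch follows (not from this patch); it assumes the usual libcu++ header layout in which ``to_underlying`` and ``unreachable`` live in <cuda/std/utility> (the header name is elided in the table above) and that the code is compiled as CUDA C++11 or later:

#include <cuda/std/utility>

enum class axis : unsigned char { x = 0, y = 1, z = 2 };

__host__ __device__ unsigned char axis_index(axis a)
{
  switch (a)
  {
    case axis::x:
    case axis::y:
    case axis::z:
      // to_underlying converts the enumerator to its underlying type (unsigned char here)
      // without spelling the type out in a static_cast.
      return cuda::std::to_underlying(a);
    default:
      // unreachable marks control flow that cannot occur, letting the compiler optimize it away.
      cuda::std::unreachable();
  }
}

Both calls work in host and device code per the availability note above; the enum and function names here are purely illustrative.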
diff --git a/docs/libcudacxx/standard_api/c_library.rst b/docs/libcudacxx/standard_api/c_library.rst index 9751a1dcb4e..122b15998a3 100644 --- a/docs/libcudacxx/standard_api/c_library.rst +++ b/docs/libcudacxx/standard_api/c_library.rst @@ -30,3 +30,6 @@ Any Standard C++ header not listed below is omitted. * - `\ `_ - Common utilities - libcu++ 2.2.0 / CCCL 2.2.0 / CUDA 12.3 + * - `\ `_ + - Provides array manipulation functions `memcpy` and `memset` + - CCCL 3.0.0 diff --git a/docs/libcudacxx/standard_api/numerics_library.rst b/docs/libcudacxx/standard_api/numerics_library.rst index 5310cd6ddf9..4181e301864 100644 --- a/docs/libcudacxx/standard_api/numerics_library.rst +++ b/docs/libcudacxx/standard_api/numerics_library.rst @@ -10,6 +10,7 @@ Numerics Library numerics_library/bit numerics_library/complex numerics_library/numeric + numerics_library/linalg Any Standard C++ header not listed below is omitted. diff --git a/docs/libcudacxx/standard_api/numerics_library/linalg.rst b/docs/libcudacxx/standard_api/numerics_library/linalg.rst new file mode 100644 index 00000000000..cc034eeab94 --- /dev/null +++ b/docs/libcudacxx/standard_api/numerics_library/linalg.rst @@ -0,0 +1,31 @@ +.. _libcudacxx-standard-api-numerics-linalg: + +```` +============================================ + +Provided functionalities +------------------------ + +- ``scaled()`` `std::linalg::scaled `_ +- ``scaled_accessor`` `std::linalg::scaled_accessor `_ +- ``conjugated()`` `std::linalg::conjugated `_ +- ``conjugated_accessor`` `std::linalg::conjugated_accessor `_ +- ``transposed()`` `std::linalg::transposed `_ +- ``layout_transpose`` `std::linalg::layout_transpose `_ +- ``conjugate_transposed()`` `std::linalg::conjugate_transposed `_ + +Extensions +---------- + +- C++26 ``std::linalg`` accessors, transposed layout, and related functions are available in C++17 + +Omissions +--------- + +- Currently we do not expose any BLAS functions and layouts. + +Restrictions +------------ + +- On device no exceptions are thrown in case of a bad access. +- MSVC is only supported with C++20 diff --git a/docs/libcudacxx/standard_api/ranges_library.rst b/docs/libcudacxx/standard_api/ranges_library.rst index 25841bd7b5a..61e023928bc 100644 --- a/docs/libcudacxx/standard_api/ranges_library.rst +++ b/docs/libcudacxx/standard_api/ranges_library.rst @@ -13,10 +13,10 @@ See the documentation of the standard headers `\ + * - `\ `_ - Iterator related concepts and machinery such as ``cuda::std::forward_iterator`` - CCCL 2.3.0 / CUDA 12.4 - * - + * - `\ `_ - Range related concepts and machinery such as ``cuda::std::ranges::forward_range`` and ``cuda::std::ranges::subrange`` - CCCL 2.4.0 / CUDA 12.5 diff --git a/docs/libcudacxx/standard_api/type_support.rst b/docs/libcudacxx/standard_api/type_support.rst new file mode 100644 index 00000000000..52b64de6a62 --- /dev/null +++ b/docs/libcudacxx/standard_api/type_support.rst @@ -0,0 +1,24 @@ +.. _libcudacxx-standard-api-type-support: + +Type Support Library +======================= + +.. toctree:: + :hidden: + :maxdepth: 1 + +Any Standard C++ header not listed below is omitted. + +.. 
list-table:: + :widths: 25 45 30 + :header-rows: 1 + + * - Header + - Content + - Availability + * - `\ `_ + - Limits of integral types + - libcu++ 1.0.0 / CCCL 2.0.0 / CUDA 10.2 + * - `\ `_ + - Interface to query properties of all fundamental numeric types + - libcu++ 1.0.0 / CCCL 2.0.0 / CUDA 10.2 diff --git a/docs/repo.toml b/docs/repo.toml index e949beb6e7c..999d62a8f20 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -347,6 +347,7 @@ autodoc.mock_imports = [ "numba", "pynvjitlink", "cuda.bindings", + "cuda.cccl", "llvmlite", "numpy", ] diff --git a/examples/basic/CMakeLists.txt b/examples/basic/CMakeLists.txt index f664422335e..cc50b7d1dde 100644 --- a/examples/basic/CMakeLists.txt +++ b/examples/basic/CMakeLists.txt @@ -41,6 +41,7 @@ endif() # Creates a cmake executable target for the main program add_executable(example_project example.cu) +target_compile_features(example_project PUBLIC cuda_std_17) # "Links" the CCCL Cmake target to the `example_project` executable. This configures everything needed to use # CCCL headers, including setting up include paths, compiler flags, etc. diff --git a/libcudacxx/examples/trie.cu b/libcudacxx/examples/trie.cu index 9144b8bc41c..f14e81eaabf 100644 --- a/libcudacxx/examples/trie.cu +++ b/libcudacxx/examples/trie.cu @@ -149,19 +149,19 @@ inline void assert_(cudaError_t code, const char* file, int line) template struct managed_allocator { - typedef cuda::std::size_t size_type; - typedef cuda::std::ptrdiff_t difference_type; + using size_type = cuda::std::size_t; + using difference_type = cuda::std::ptrdiff_t; - typedef T value_type; - typedef T* pointer; // (deprecated in C++17)(removed in C++20) T* - typedef const T* const_pointer; // (deprecated in C++17)(removed in C++20) const T* - typedef T& reference; // (deprecated in C++17)(removed in C++20) T& - typedef const T& const_reference; // (deprecated in C++17)(removed in C++20) const T& + using value_type = T; + using pointer = T*; // (deprecated in C++17)(removed in C++20) T* + using const_pointer = const T*; // (deprecated in C++17)(removed in C++20) const T* + using reference = T&; // (deprecated in C++17)(removed in C++20) T& + using const_reference = const T&; // (deprecated in C++17)(removed in C++20) const T& template struct rebind { - typedef managed_allocator other; + using other = managed_allocator; }; managed_allocator() = default; template diff --git a/libcudacxx/include/cuda/__cmath/round_down.h b/libcudacxx/include/cuda/__cmath/round_down.h new file mode 100644 index 00000000000..7bf42f050f9 --- /dev/null +++ b/libcudacxx/include/cuda/__cmath/round_down.h @@ -0,0 +1,103 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___CMATH_ROUND_DOWN_H +#define _CUDA___CMATH_ROUND_DOWN_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! @brief Round the number \p __a to the previous multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> +round_down(const _Tp __a, const _Up __b) noexcept +{ + _CCCL_ASSERT(__b > _Up{0}, "cuda::round_down: 'b' must be positive"); + if constexpr (_CUDA_VSTD::is_signed_v<_Tp>) + { + _CCCL_ASSERT(__a >= _Tp{0}, "cuda::round_down: 'a' must be non negative"); + } + using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>; + using _Prom = decltype(_Tp{} / _Up{}); + using _UProm = _CUDA_VSTD::make_unsigned_t<_Prom>; + auto __c1 = static_cast<_UProm>(__a) / static_cast<_UProm>(__b); + return static_cast<_Common>(__c1 * static_cast<_UProm>(__b)); +} + +//! @brief Round the number \p __a to the previous multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>> +round_down(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_down(__a, _CUDA_VSTD::to_underlying(__b)); +} + +//! @brief Round the number \p __a to the previous multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up> +round_down(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_down(_CUDA_VSTD::to_underlying(__a), __b); +} + +//! @brief Round the number \p __a to the previous multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! 
@pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) +_CCCL_NODISCARD +_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, + _CUDA_VSTD::underlying_type_t<_Up>> +round_down(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_down(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 +#endif // _CUDA___CMATH_ROUND_DOWN_H diff --git a/libcudacxx/include/cuda/__cmath/round_up.h b/libcudacxx/include/cuda/__cmath/round_up.h new file mode 100644 index 00000000000..cf9bb9975f5 --- /dev/null +++ b/libcudacxx/include/cuda/__cmath/round_up.h @@ -0,0 +1,105 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___CMATH_ROUND_UP_H +#define _CUDA___CMATH_ROUND_UP_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! @brief Round the number \p __a to the next multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> +round_up(const _Tp __a, const _Up __b) noexcept +{ + _CCCL_ASSERT(__b > _Up{0}, "cuda::round_up: 'b' must be positive"); + if constexpr (_CUDA_VSTD::is_signed_v<_Tp>) + { + _CCCL_ASSERT(__a >= _Tp{0}, "cuda::round_up: 'a' must be non negative"); + } + using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>; + using _Prom = decltype(_Tp{} / _Up{}); + auto __c = ::cuda::ceil_div(static_cast<_Prom>(__a), static_cast<_Prom>(__b)); + _CCCL_ASSERT(static_cast<_Common>(__c) <= _CUDA_VSTD::numeric_limits<_Common>::max() / static_cast<_Common>(__b), + "cuda::round_up: result overflow"); + return static_cast<_Common>(static_cast<_Prom>(__c) * static_cast<_Prom>(__b)); +} + +//! @brief Round the number \p __a to the next multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>> +round_up(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_up(__a, _CUDA_VSTD::to_underlying(__b)); +} + +//! 
@brief Round the number \p __a to the next multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up> +round_up(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_up(_CUDA_VSTD::to_underlying(__a), __b); +} + +//! @brief Round the number \p __a to the next multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) +_CCCL_NODISCARD +_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, + _CUDA_VSTD::underlying_type_t<_Up>> +round_up(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_up(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 +#endif // _CUDA___CMATH_ROUND_UP_H diff --git a/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h index cb8fcb69083..72c413d65a4 100644 --- a/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h +++ b/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include @@ -135,7 +135,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memc ( // Host code path: if (__group.thread_rank() == 0) { - memcpy(__dest_char, __src_char, __size); + _CUDA_VSTD::memcpy(__dest_char, __src_char, __size); } return __completion_mechanism::__sync;)); } diff --git a/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h index 9ce2b455d59..f5af15bd51a 100644 --- a/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h +++ b/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h @@ -26,6 +26,8 @@ #include #include +#if _CCCL_HAS_CUDA_COMPILER + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX template @@ -103,9 +105,9 @@ inline _CCCL_DEVICE _Tp* __from_ptr_gmem(_CUDA_VSTD::size_t __ptr) template inline _CCCL_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val) { -#if _CCCL_STD_VER >= 2017 +# if _CCCL_STD_VER >= 2017 static_assert(sizeof(_Tp) == 4, ""); -#endif // _CCCL_STD_VER >= 2017 +# endif // _CCCL_STD_VER >= 2017 // Consider using std::bitcast return *reinterpret_cast<_CUDA_VSTD::uint32_t*>(&__val); } @@ -113,13 +115,15 @@ inline _CCCL_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val) template inline _CCCL_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) { -#if _CCCL_STD_VER >= 2017 +# if _CCCL_STD_VER >= 2017 static_assert(sizeof(_Tp) == 8, ""); -#endif // _CCCL_STD_VER >= 2017 +# endif // _CCCL_STD_VER >= 2017 // Consider using std::bitcast return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val); } _LIBCUDACXX_END_NAMESPACE_CUDA_PTX +#endif // _CCCL_HAS_CUDA_COMPILER + #endif // _CUDA_PTX_HELPER_FUNCTIONS_H_ diff --git a/libcudacxx/include/cuda/__type_traits/is_floating_point.h b/libcudacxx/include/cuda/__type_traits/is_floating_point.h new file mode 100644 index 00000000000..e253315a672 --- /dev/null +++ 
b/libcudacxx/include/cuda/__type_traits/is_floating_point.h @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDA__TYPE_TRAITS_IS_FLOATING_POINT_H +#define __CUDA__TYPE_TRAITS_IS_FLOATING_POINT_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_floating_point + : _CUDA_VSTD::bool_constant<_CUDA_VSTD::is_floating_point<_CUDA_VSTD::remove_cv_t<_Tp>>::value + || _CUDA_VSTD::__is_extended_floating_point<_CUDA_VSTD::remove_cv_t<_Tp>>::value> +{}; + +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) +template +_CCCL_INLINE_VAR constexpr bool is_floating_point_v = + _CUDA_VSTD::is_floating_point_v<_CUDA_VSTD::remove_cv_t<_Tp>> + || _CUDA_VSTD::__is_extended_floating_point_v<_CUDA_VSTD::remove_cv_t<_Tp>>; +#endif // !_CCCL_NO_VARIABLE_TEMPLATES + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // __CUDA__TYPE_TRAITS_IS_FLOATING_POINT_H diff --git a/libcudacxx/include/cuda/cmath b/libcudacxx/include/cuda/cmath index 3de1cc6e920..20683c8676f 100644 --- a/libcudacxx/include/cuda/cmath +++ b/libcudacxx/include/cuda/cmath @@ -22,6 +22,8 @@ #endif // no system header #include +#include +#include #include #endif // _CUDA_CMATH diff --git a/libcudacxx/include/cuda/discard_memory b/libcudacxx/include/cuda/discard_memory index 6da2ea209c4..5177b7ee407 100644 --- a/libcudacxx/include/cuda/discard_memory +++ b/libcudacxx/include/cuda/discard_memory @@ -21,11 +21,12 @@ # pragma system_header #endif // no system header +#include #include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, size_t __nbytes) noexcept +inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, _CUDA_VSTD::size_t __nbytes) noexcept { // The discard PTX instruction is only available with PTX ISA 7.4 and later #if __cccl_ptx_isa < 740ULL diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index 7946e8bdc91..a96beb3a520 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -141,6 +141,8 @@ # pragma system_header #endif // no system header +#include +#include #include #include #include diff --git a/libcudacxx/include/cuda/std/__algorithm/copy.h b/libcudacxx/include/cuda/std/__algorithm/copy.h index f4013d4ea73..c42ed39fd5f 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy.h @@ -28,8 +28,8 @@ #include #include #include -#include // ::memmove -#include +#include +#include // memmove _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__algorithm/for_each_n.h b/libcudacxx/include/cuda/std/__algorithm/for_each_n.h index 54428acb69d..ceca1e67af3 100644 --- a/libcudacxx/include/cuda/std/__algorithm/for_each_n.h +++ b/libcudacxx/include/cuda/std/__algorithm/for_each_n.h @@ -28,8 
+28,8 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _InputIterator for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) { - typedef decltype(_CUDA_VSTD::__convert_to_integral(__orig_n)) _IntegralSize; - _IntegralSize __n = __orig_n; + using _IntegralSize = decltype(_CUDA_VSTD::__convert_to_integral(__orig_n)); + _IntegralSize __n = __orig_n; while (__n > 0) { __f(*__first); diff --git a/libcudacxx/include/cuda/std/__algorithm/is_heap_until.h b/libcudacxx/include/cuda/std/__algorithm/is_heap_until.h index ace9c539664..f62e0c9d5ed 100644 --- a/libcudacxx/include/cuda/std/__algorithm/is_heap_until.h +++ b/libcudacxx/include/cuda/std/__algorithm/is_heap_until.h @@ -30,7 +30,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _RandomAccessIterator __is_heap_until(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare&& __comp) { - typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; + using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type; difference_type __len = __last - __first; difference_type __p = 0; difference_type __c = 1; diff --git a/libcudacxx/include/cuda/std/__algorithm/partition_point.h b/libcudacxx/include/cuda/std/__algorithm/partition_point.h index 446c74bad42..f4c17edbf55 100644 --- a/libcudacxx/include/cuda/std/__algorithm/partition_point.h +++ b/libcudacxx/include/cuda/std/__algorithm/partition_point.h @@ -31,7 +31,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _ForwardIterator partition_point(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { - typedef typename iterator_traits<_ForwardIterator>::difference_type difference_type; + using difference_type = typename iterator_traits<_ForwardIterator>::difference_type; difference_type __len = _CUDA_VSTD::distance(__first, __last); while (__len != 0) { diff --git a/libcudacxx/include/cuda/std/__algorithm/rotate.h b/libcudacxx/include/cuda/std/__algorithm/rotate.h index 2aef1790ba0..709b6cddc0f 100644 --- a/libcudacxx/include/cuda/std/__algorithm/rotate.h +++ b/libcudacxx/include/cuda/std/__algorithm/rotate.h @@ -35,8 +35,8 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _ForwardIterator __rotate_left(_ForwardIterator __first, _ForwardIterator __last) { - typedef typename iterator_traits<_ForwardIterator>::value_type value_type; - using _Ops = _IterOps<_AlgPolicy>; + using value_type = typename iterator_traits<_ForwardIterator>::value_type; + using _Ops = _IterOps<_AlgPolicy>; value_type __tmp = _Ops::__iter_move(__first); _ForwardIterator __lm1 = _CUDA_VSTD::__move<_AlgPolicy>(_Ops::next(__first), __last, __first).second; @@ -48,8 +48,8 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _BidirectionalIterator __rotate_right(_BidirectionalIterator __first, _BidirectionalIterator __last) { - typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type; - using _Ops = _IterOps<_AlgPolicy>; + using value_type = typename iterator_traits<_BidirectionalIterator>::value_type; + using _Ops = _IterOps<_AlgPolicy>; _BidirectionalIterator __lm1 = _Ops::prev(__last); value_type __tmp = _Ops::__iter_move(__lm1); @@ -118,9 +118,9 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _RandomAccessIterator __rotate_gcd(_RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomAccessIterator __last) { - typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; - typedef typename 
iterator_traits<_RandomAccessIterator>::value_type value_type; - using _Ops = _IterOps<_AlgPolicy>; + using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type; + using value_type = typename iterator_traits<_RandomAccessIterator>::value_type; + using _Ops = _IterOps<_AlgPolicy>; const difference_type __m1 = __middle - __first; const difference_type __m2 = _Ops::distance(__middle, __last); @@ -158,7 +158,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _ForwardIterator __rotate_impl( _ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last, _CUDA_VSTD::forward_iterator_tag) { - typedef typename iterator_traits<_ForwardIterator>::value_type value_type; + using value_type = typename iterator_traits<_ForwardIterator>::value_type; if (_CCCL_TRAIT(is_trivially_move_assignable, value_type)) { if (_IterOps<_AlgPolicy>::next(__first) == __middle) @@ -176,7 +176,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _BidirectionalIterator __rotate_ _BidirectionalIterator __last, bidirectional_iterator_tag) { - typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type; + using value_type = typename iterator_traits<_BidirectionalIterator>::value_type; if (_CCCL_TRAIT(is_trivially_move_assignable, value_type)) { if (_IterOps<_AlgPolicy>::next(__first) == __middle) @@ -198,7 +198,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _RandomAccessIterator __rotate_i _RandomAccessIterator __last, random_access_iterator_tag) { - typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type; + using value_type = typename iterator_traits<_RandomAccessIterator>::value_type; if (_CCCL_TRAIT(is_trivially_move_assignable, value_type)) { if (_IterOps<_AlgPolicy>::next(__first) == __middle) diff --git a/libcudacxx/include/cuda/std/__algorithm/search.h b/libcudacxx/include/cuda/std/__algorithm/search.h index 078ac059773..1b4d5ed316f 100644 --- a/libcudacxx/include/cuda/std/__algorithm/search.h +++ b/libcudacxx/include/cuda/std/__algorithm/search.h @@ -93,8 +93,8 @@ __search(_RandomAccessIterator1 __first1, random_access_iterator_tag, random_access_iterator_tag) { - typedef typename iterator_traits<_RandomAccessIterator1>::difference_type _Diff1; - typedef typename iterator_traits<_RandomAccessIterator2>::difference_type _Diff2; + using _Diff1 = typename iterator_traits<_RandomAccessIterator1>::difference_type; + using _Diff2 = typename iterator_traits<_RandomAccessIterator2>::difference_type; // Take advantage of knowing source and pattern lengths. 
Stop short when source is smaller than pattern const _Diff2 __len2 = __last2 - __first2; if (__len2 == 0) diff --git a/libcudacxx/include/cuda/std/__algorithm/sift_down.h b/libcudacxx/include/cuda/std/__algorithm/sift_down.h index d0a8f2e75aa..421728039b3 100644 --- a/libcudacxx/include/cuda/std/__algorithm/sift_down.h +++ b/libcudacxx/include/cuda/std/__algorithm/sift_down.h @@ -35,8 +35,8 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __sift_down( { using _Ops = _IterOps<_AlgPolicy>; - typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; - typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type; + using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type; + using value_type = typename iterator_traits<_RandomAccessIterator>::value_type; // left-child of __start is at 2 * __start + 1 // right-child of __start is at 2 * __start + 2 difference_type __child = __start - __first; diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h index 2ecd56daf55..be06fbd34d1 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h @@ -22,6 +22,7 @@ #include #include +#include // This file works around a bug in CUDA in which the compiler miscompiles // atomics to automatic storage (local memory). This bug is not fixed on any @@ -96,7 +97,7 @@ _CCCL_DEVICE inline bool __cuda_load_weak_if_local(const volatile void* __ptr, v { return false; } - memcpy(__ret, const_cast(__ptr), __size); + _CUDA_VSTD::memcpy(__ret, const_cast(__ptr), __size); // Required to workaround a compiler bug, see nvbug/4064730 NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);)) return true; @@ -108,7 +109,7 @@ _CCCL_DEVICE inline bool __cuda_store_weak_if_local(volatile void* __ptr, const { return false; } - memcpy(const_cast(__ptr), __val, __size); + _CUDA_VSTD::memcpy(const_cast(__ptr), __val, __size); return true; } @@ -122,12 +123,12 @@ __cuda_compare_exchange_weak_if_local(volatile _Type* __ptr, _Type* __expected, } if (__atomic_memcmp(const_cast(__ptr), const_cast(__expected), sizeof(_Type)) == 0) { - memcpy(const_cast<_Type*>(__ptr), const_cast<_Type const*>(__desired), sizeof(_Type)); + _CUDA_VSTD::memcpy(const_cast<_Type*>(__ptr), const_cast<_Type const*>(__desired), sizeof(_Type)); *__success = true; } else { - memcpy(const_cast<_Type*>(__expected), const_cast<_Type const*>(__ptr), sizeof(_Type)); + _CUDA_VSTD::memcpy(const_cast<_Type*>(__expected), const_cast<_Type const*>(__ptr), sizeof(_Type)); *__success = false; } NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);)) @@ -141,8 +142,8 @@ _CCCL_DEVICE bool __cuda_exchange_weak_if_local(volatile _Type* __ptr, _Type* __ { return false; } - memcpy(const_cast<_Type*>(__ret), const_cast(__ptr), sizeof(_Type)); - memcpy(const_cast<_Type*>(__ptr), const_cast(__val), sizeof(_Type)); + _CUDA_VSTD::memcpy(const_cast<_Type*>(__ret), const_cast(__ptr), sizeof(_Type)); + _CUDA_VSTD::memcpy(const_cast<_Type*>(__ptr), const_cast(__val), sizeof(_Type)); NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);)) return true; } @@ -154,7 +155,7 @@ _CCCL_DEVICE bool __cuda_fetch_weak_if_local(volatile _Type* __ptr, _Type __val, { return false; } - memcpy(const_cast<_Type*>(__ret), const_cast(__ptr), sizeof(_Type)); + _CUDA_VSTD::memcpy(const_cast<_Type*>(__ret), const_cast(__ptr), sizeof(_Type)); __bop(*__ptr, __val); 
NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);)) return true; diff --git a/libcudacxx/include/cuda/std/__atomic/order.h b/libcudacxx/include/cuda/std/__atomic/order.h index 27136bb3244..bafbd86fe5f 100644 --- a/libcudacxx/include/cuda/std/__atomic/order.h +++ b/libcudacxx/include/cuda/std/__atomic/order.h @@ -84,15 +84,14 @@ inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; #else // ^^^ C++20 ^^^ / vvv C++17 vvv -typedef enum memory_order -{ +using memory_order = enum memory_order { memory_order_relaxed = __mo_relaxed, memory_order_consume = __mo_consume, memory_order_acquire = __mo_acquire, memory_order_release = __mo_release, memory_order_acq_rel = __mo_acq_rel, memory_order_seq_cst = __mo_seq_cst, -} memory_order; +}; #endif // _CCCL_STD_VER >= 2020 diff --git a/libcudacxx/include/cuda/std/__atomic/types/common.h b/libcudacxx/include/cuda/std/__atomic/types/common.h index 6706ad5181b..5d1f5f2d654 100644 --- a/libcudacxx/include/cuda/std/__atomic/types/common.h +++ b/libcudacxx/include/cuda/std/__atomic/types/common.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -92,7 +92,7 @@ _CCCL_HOST_DEVICE inline int __atomic_memcmp(void const* __lhs, void const* __rh } } return 0;), NV_IS_HOST, - (return memcmp(__lhs, __rhs, __count);)) + (return _CUDA_VSTD::memcmp(__lhs, __rhs, __count);)) } _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__atomic/types/small.h b/libcudacxx/include/cuda/std/__atomic/types/small.h index a4e969f0936..e9ce704b4c1 100644 --- a/libcudacxx/include/cuda/std/__atomic/types/small.h +++ b/libcudacxx/include/cuda/std/__atomic/types/small.h @@ -28,6 +28,7 @@ #include #include #include +#include _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -53,7 +54,7 @@ template = 0> _CCCL_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { __atomic_small_proxy_t<_Tp> __temp{}; - memcpy(&__temp, &__val, sizeof(_Tp)); + _CUDA_VSTD::memcpy(&__temp, &__val, sizeof(_Tp)); return __temp; } @@ -61,7 +62,7 @@ template = 0> _CCCL_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { _Tp __temp{}; - memcpy(&__temp, &__val, sizeof(_Tp)); + _CUDA_VSTD::memcpy(&__temp, &__val, sizeof(_Tp)); return __temp; } diff --git a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h index 1fc40a07665..00646e47984 100644 --- a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h +++ b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h @@ -25,6 +25,10 @@ #include #include +#if !_CCCL_COMPILER(NVRTC) +# include +#endif // !_CCCL_COMPILER(NVRTC) + _LIBCUDACXX_BEGIN_NAMESPACE_STD extern "C" _CCCL_DEVICE void __atomic_try_wait_unsupported_before_SM_70__(); @@ -56,7 +60,7 @@ _LIBCUDACXX_HIDE_FROM_ABI bool __nonatomic_compare_equal(_Tp const& __lhs, _Tp c #if _CCCL_HAS_CUDA_COMPILER return __lhs == __rhs; #else - return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; + return _CUDA_VSTD::memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; #endif } diff --git a/libcudacxx/include/cuda/std/__bit/bit_cast.h b/libcudacxx/include/cuda/std/__bit/bit_cast.h index a0579942d90..7e265232d0b 100644 --- a/libcudacxx/include/cuda/std/__bit/bit_cast.h +++ b/libcudacxx/include/cuda/std/__bit/bit_cast.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__bit/reference.h b/libcudacxx/include/cuda/std/__bit/reference.h index 12acac014b1..13a5898014d 100644 
--- a/libcudacxx/include/cuda/std/__bit/reference.h +++ b/libcudacxx/include/cuda/std/__bit/reference.h @@ -30,7 +30,6 @@ #include #include #include -#include _CCCL_PUSH_MACROS diff --git a/libcudacxx/include/cuda/std/__cccl/assert.h b/libcudacxx/include/cuda/std/__cccl/assert.h index 5ef9314f310..b8acc604a65 100644 --- a/libcudacxx/include/cuda/std/__cccl/assert.h +++ b/libcudacxx/include/cuda/std/__cccl/assert.h @@ -122,13 +122,14 @@ _CCCL_HOST_DEVICE //! _CCCL_VERIFY is enabled unconditionally and reserved for critical checks that are required to always be on //! _CCCL_ASSERT is enabled conditionally depending on CCCL_ENABLE_HOST_ASSERTIONS and CCCL_ENABLE_DEVICE_ASSERTIONS -#if _CCCL_CUDA_COMPILER(NVHPC) // NVHPC needs to use NV_IF_TARGET instead of __CUDA_ARCH__ -# define _CCCL_VERIFY(expression, message) \ - NV_IF_ELSE_TARGET( \ - NV_IS_DEVICE, (_CCCL_ASSERT_IMPL_DEVICE(expression, message);), (_CCCL_ASSERT_IMPL_HOST(expression, message);)) -# define _CCCL_ASSERT(expression, message) \ - NV_IF_ELSE_TARGET( \ - NV_IS_DEVICE, (_CCCL_ASSERT_DEVICE(expression, message);), (_CCCL_ASSERT_HOST(expression, message);)) +#if _CCCL_CUDA_COMPILER(NVHPC) // NVHPC can't have different behavior for host and device. + // The host version of the assert will also work in device code. +# define _CCCL_VERIFY(expression, message) _CCCL_ASSERT_IMPL_HOST(expression, message) +# if defined(CCCL_ENABLE_HOST_ASSERTIONS) || defined(CCCL_ENABLE_DEVICE_ASSERTIONS) +# define _CCCL_ASSERT(expression, message) _CCCL_ASSERT_HOST(expression, message) +# else +# define _CCCL_ASSERT(expression, message) ((void) 0) +# endif #elif _CCCL_HAS_CUDA_COMPILER # ifdef __CUDA_ARCH__ # define _CCCL_VERIFY(expression, message) _CCCL_ASSERT_IMPL_DEVICE(expression, message) diff --git a/libcudacxx/include/cuda/std/__cccl/attributes.h b/libcudacxx/include/cuda/std/__cccl/attributes.h index a5888cc289e..79f9cadbdc1 100644 --- a/libcudacxx/include/cuda/std/__cccl/attributes.h +++ b/libcudacxx/include/cuda/std/__cccl/attributes.h @@ -134,4 +134,10 @@ # define _CCCL_RESTRICT __restrict__ #endif // ^^^ !_CCCL_COMPILER(MSVC) ^^^ +#if _CCCL_HAS_CPP_ATTRIBUTE(assume) +# define _CCCL_ASSUME(...) [[assume(__VA_ARGS__)]] +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv +# define _CCCL_ASSUME(...) _CCCL_BUILTIN_ASSUME(__VA_ARGS__) +#endif // ^^^ !_CCCL_COMPILER(MSVC) ^^^ + #endif // __CCCL_ATTRIBUTES_H diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index 3a5fda2f0f5..aee334b5562 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -101,10 +101,22 @@ # define _CCCL_BUILTIN_ADDRESSOF(...) __builtin_addressof(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_addressof) -#if _CCCL_CHECK_BUILTIN(builtin_assume) +#if _CCCL_CHECK_BUILTIN(builtin_assume) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) # define _CCCL_BUILTIN_ASSUME(...) __builtin_assume(__VA_ARGS__) +#elif _CCCL_COMPILER(GCC, >=, 13) +# define _CCCL_BUILTIN_ASSUME(...) \ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (__builtin_assume(__VA_ARGS__);), (__attribute__((__assume__(__VA_ARGS__)));)) +#elif _CCCL_COMPILER(MSVC) +# define _CCCL_BUILTIN_ASSUME(...) \ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (__builtin_assume(__VA_ARGS__);), (__assume(__VA_ARGS__);)) #endif // _CCCL_CHECK_BUILTIN(builtin_assume) +#if _CCCL_CHECK_BUILTIN(builtin_prefetch) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_PREFETCH(...) 
NV_IF_TARGET(NV_IS_HOST, __builtin_prefetch(__VA_ARGS__);) +#else +# define _CCCL_BUILTIN_PREFETCH(...) +#endif // _CCCL_CHECK_BUILTIN(builtin_prefetch) + // NVCC prior to 11.2 cannot handle __builtin_assume #if _CCCL_CUDACC_BELOW(11, 2) # undef _CCCL_BUILTIN_ASSUME @@ -150,6 +162,33 @@ # undef _CCCL_BUILTIN_BSWAP128 #endif // _CCCL_CUDA_COMPILER(NVCC) +#if _CCCL_CHECK_BUILTIN(builtin_cbrt) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_CBRTF(...) __builtin_cbrtf(__VA_ARGS__) +# define _CCCL_BUILTIN_CBRT(...) __builtin_cbrt(__VA_ARGS__) +# define _CCCL_BUILTIN_CBRTL(...) __builtin_cbrtl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_cbrt) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "cbrt" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_CBRTF +# undef _CCCL_BUILTIN_CBRT +# undef _CCCL_BUILTIN_CBRTL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_ceil) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_CEILF(...) __builtin_ceilf(__VA_ARGS__) +# define _CCCL_BUILTIN_CEIL(...) __builtin_ceil(__VA_ARGS__) +# define _CCCL_BUILTIN_CEILL(...) __builtin_ceill(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_ceil) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_CEILF +# undef _CCCL_BUILTIN_CEIL +# undef _CCCL_BUILTIN_CEILL +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_COLUMN() __builtin_COLUMN() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_COLUMN) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_COLUMN) vvv @@ -162,14 +201,69 @@ # define _CCCL_BUILTIN_COLUMN() 0 #endif // _CCCL_CUDACC_BELOW(11, 3) -#if _CCCL_CHECK_BUILTIN(builtin_contant_p) || _CCCL_COMPILER(GCC) +#if _CCCL_CHECK_BUILTIN(builtin_constant_p) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_CONSTANT_P(...) __builtin_constant_p(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(builtin_contant_p) +#endif // _CCCL_CHECK_BUILTIN(builtin_constant_p) + +#if _CCCL_CHECK_BUILTIN(builtin_exp) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_EXPF(...) __builtin_expf(__VA_ARGS__) +# define _CCCL_BUILTIN_EXP(...) __builtin_exp(__VA_ARGS__) +# define _CCCL_BUILTIN_EXPL(...) __builtin_expl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_exp) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "expf" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_EXPF +# undef _CCCL_BUILTIN_EXP +# undef _CCCL_BUILTIN_EXPL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_exp2) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_EXP2F(...) __builtin_exp2f(__VA_ARGS__) +# define _CCCL_BUILTIN_EXP2(...) __builtin_exp2(__VA_ARGS__) +# define _CCCL_BUILTIN_EXP2L(...) 
__builtin_exp2l(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_exp2) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "exp2" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_EXP2F +# undef _CCCL_BUILTIN_EXP2 +# undef _CCCL_BUILTIN_EXP2L +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_expm1) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_EXPM1F(...) __builtin_expm1f(__VA_ARGS__) +# define _CCCL_BUILTIN_EXPM1(...) __builtin_expm1(__VA_ARGS__) +# define _CCCL_BUILTIN_EXPM1L(...) __builtin_expm1l(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_expm1) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "expm1" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_EXPM1F +# undef _CCCL_BUILTIN_EXPM1 +# undef _CCCL_BUILTIN_EXPM1L +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) #if _CCCL_CHECK_BUILTIN(builtin_expect) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_EXPECT(...) __builtin_expect(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_expect) +#if _CCCL_CHECK_BUILTIN(builtin_floor) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_FLOORF(...) __builtin_floorf(__VA_ARGS__) +# define _CCCL_BUILTIN_FLOOR(...) __builtin_floor(__VA_ARGS__) +# define _CCCL_BUILTIN_FLOORL(...) __builtin_floorl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_floor) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_FLOORF +# undef _CCCL_BUILTIN_FLOOR +# undef _CCCL_BUILTIN_FLOORL +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_CHECK_BUILTIN(builtin_fmax) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_FMAXF(...) __builtin_fmaxf(__VA_ARGS__) # define _CCCL_BUILTIN_FMAX(...) __builtin_fmax(__VA_ARGS__) @@ -217,6 +311,20 @@ # undef _CCCL_BUILTIN_FPCLASSIFY #endif // _CCCL_CUDACC_BELOW(11, 7) +#if _CCCL_CHECK_BUILTIN(builtin_frexp) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_FREXPF(...) __builtin_frexpf(__VA_ARGS__) +# define _CCCL_BUILTIN_FREXP(...) __builtin_frexp(__VA_ARGS__) +# define _CCCL_BUILTIN_FREXPL(...) 
__builtin_frexpl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_frexp) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "frexp" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_FREXPF +# undef _CCCL_BUILTIN_FREXP +# undef _CCCL_BUILTIN_FREXPL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + #if _CCCL_HAS_BUILTIN(__builtin_FUNCTION) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_FUNCTION() __builtin_FUNCTION() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_FUNCTION) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_FUNCTION) vvv @@ -229,6 +337,20 @@ # define _CCCL_BUILTIN_FUNCTION() "__builtin_FUNCTION is unsupported" #endif // _CCCL_CUDACC_BELOW(11, 3) +#if _CCCL_CHECK_BUILTIN(builtin_huge_valf) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10) +# define _CCCL_BUILTIN_HUGE_VALF() __builtin_huge_valf() +#endif // _CCCL_CHECK_BUILTIN(builtin_huge_valf) + +#if _CCCL_CHECK_BUILTIN(builtin_huge_val) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10) +# define _CCCL_BUILTIN_HUGE_VAL() __builtin_huge_val() +#endif // _CCCL_CHECK_BUILTIN(builtin_huge_val) + +#if _CCCL_CHECK_BUILTIN(builtin_huge_vall) || _CCCL_COMPILER(GCC, <, 10) +# define _CCCL_BUILTIN_HUGE_VALL() __builtin_huge_vall() +#elif _CCCL_COMPILER(MSVC) +# define _CCCL_BUILTIN_HUGE_VALL() static_cast(__builtin_huge_val()) +#endif // _CCCL_CHECK_BUILTIN(builtin_huge_vall) + #if _CCCL_CHECK_BUILTIN(builtin_is_constant_evaluated) || _CCCL_COMPILER(GCC, >=, 9) \ || (_CCCL_COMPILER(MSVC, >, 19, 24) && _CCCL_CUDACC_AT_LEAST(11, 3)) # define _CCCL_BUILTIN_IS_CONSTANT_EVALUATED(...) __builtin_is_constant_evaluated(__VA_ARGS__) @@ -276,6 +398,20 @@ # undef _CCCL_BUILTIN_LAUNDER #endif // clang < 10 || nvcc < 11.3 +#if _CCCL_CHECK_BUILTIN(builtin_ldexp) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LDEXPF(...) __builtin_ldexpf(__VA_ARGS__) +# define _CCCL_BUILTIN_LDEXP(...) __builtin_ldexp(__VA_ARGS__) +# define _CCCL_BUILTIN_LDEXPL(...) __builtin_ldexpl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_ldexp) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "ldexp" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_LDEXPF +# undef _CCCL_BUILTIN_LDEXP +# undef _CCCL_BUILTIN_LDEXPL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + #if _CCCL_HAS_BUILTIN(__builtin_LINE) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_LINE() __builtin_LINE() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_LINE) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_LINE) vvv @@ -288,19 +424,60 @@ # define _CCCL_BUILTIN_LINE() __LINE__ #endif // _CCCL_CUDACC_BELOW(11, 3) -#if _CCCL_CHECK_BUILTIN(builtin_huge_valf) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10) -# define _CCCL_BUILTIN_HUGE_VALF() __builtin_huge_valf() -#endif // _CCCL_CHECK_BUILTIN(builtin_huge_valf) +#if _CCCL_CHECK_BUILTIN(builtin_llrint) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LLRINTF(...) __builtin_llrintf(__VA_ARGS__) +# define _CCCL_BUILTIN_LLRINT(...) __builtin_llrint(__VA_ARGS__) +# define _CCCL_BUILTIN_LLRINTL(...) 
__builtin_llrintl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_llrint) -#if _CCCL_CHECK_BUILTIN(builtin_huge_val) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10) -# define _CCCL_BUILTIN_HUGE_VAL() __builtin_huge_val() -#endif // _CCCL_CHECK_BUILTIN(builtin_huge_val) +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "llrint" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_LLRINTF +# undef _CCCL_BUILTIN_LLRINT +# undef _CCCL_BUILTIN_LLRINTL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) -#if _CCCL_CHECK_BUILTIN(builtin_huge_vall) || _CCCL_COMPILER(GCC, <, 10) -# define _CCCL_BUILTIN_HUGE_VALL() __builtin_huge_vall() -#elif _CCCL_COMPILER(MSVC) -# define _CCCL_BUILTIN_HUGE_VALL() static_cast(__builtin_huge_val()) -#endif // _CCCL_CHECK_BUILTIN(builtin_huge_vall) +#if _CCCL_CHECK_BUILTIN(builtin_llround) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LLROUNDF(...) __builtin_llroundf(__VA_ARGS__) +# define _CCCL_BUILTIN_LLROUND(...) __builtin_llround(__VA_ARGS__) +# define _CCCL_BUILTIN_LLROUNDL(...) __builtin_llroundl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_llround) + +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "llround" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_LLROUNDF +# undef _CCCL_BUILTIN_LLROUND +# undef _CCCL_BUILTIN_LLROUNDL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_lrint) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LRINTF(...) __builtin_lrintf(__VA_ARGS__) +# define _CCCL_BUILTIN_LRINT(...) __builtin_lrint(__VA_ARGS__) +# define _CCCL_BUILTIN_LRINTL(...) __builtin_lrintl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_lrint) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "lrint" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_LRINTF +# undef _CCCL_BUILTIN_LRINT +# undef _CCCL_BUILTIN_LRINTL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_lround) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LROUNDF(...) __builtin_lroundf(__VA_ARGS__) +# define _CCCL_BUILTIN_LROUND(...) __builtin_lround(__VA_ARGS__) +# define _CCCL_BUILTIN_LROUNDL(...) __builtin_lroundl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_lround) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "lround" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_LROUNDF +# undef _CCCL_BUILTIN_LROUND +# undef _CCCL_BUILTIN_LROUNDL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) #if _CCCL_CHECK_BUILTIN(builtin_nanf) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10) # define _CCCL_BUILTIN_NANF(...) __builtin_nanf(__VA_ARGS__) @@ -330,6 +507,46 @@ # define _CCCL_BUILTIN_NANSL(...) static_cast(__builtin_nans(__VA_ARGS__)) #endif // _CCCL_CHECK_BUILTIN(builtin_nansl) +#if _CCCL_CHECK_BUILTIN(builtin_nearbyint) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_NEARBYINTF(...) __builtin_nearbyintf(__VA_ARGS__) +# define _CCCL_BUILTIN_NEARBYINT(...) __builtin_nearbyint(__VA_ARGS__) +# define _CCCL_BUILTIN_NEARBYINTL(...) 
__builtin_nearbyintl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_nearbyint) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_NEARBYINTF +# undef _CCCL_BUILTIN_NEARBYINT +# undef _CCCL_BUILTIN_NEARBYINTL +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_nextafter) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_NEXTAFTERF(...) __builtin_nextafterf(__VA_ARGS__) +# define _CCCL_BUILTIN_NEXTAFTER(...) __builtin_nextafter(__VA_ARGS__) +# define _CCCL_BUILTIN_NEXTAFTERL(...) __builtin_nextafterl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_nextafter) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "nextafter" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_NEXTAFTERF +# undef _CCCL_BUILTIN_NEXTAFTER +# undef _CCCL_BUILTIN_NEXTAFTERL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_nexttoward) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_NEXTTOWARDF(...) __builtin_nexttowardf(__VA_ARGS__) +# define _CCCL_BUILTIN_NEXTTOWARD(...) __builtin_nexttoward(__VA_ARGS__) +# define _CCCL_BUILTIN_NEXTTOWARDL(...) __builtin_nexttowardl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_nexttoward) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_NEXTTOWARDF +# undef _CCCL_BUILTIN_NEXTTOWARD +# undef _CCCL_BUILTIN_NEXTTOWARDL +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_CHECK_BUILTIN(builtin_log) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_LOGF(...) __builtin_logf(__VA_ARGS__) # define _CCCL_BUILTIN_LOG(...) __builtin_log(__VA_ARGS__) @@ -356,7 +573,7 @@ # undef _CCCL_BUILTIN_LOG10F # undef _CCCL_BUILTIN_LOG10 # undef _CCCL_BUILTIN_LOG10L -#endif // _CCCL_CUDACC_BELOW(11, 7) +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) #if _CCCL_CHECK_BUILTIN(builtin_ilogb) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_ILOGBF(...) __builtin_ilogbf(__VA_ARGS__) @@ -398,7 +615,7 @@ # undef _CCCL_BUILTIN_LOG2F # undef _CCCL_BUILTIN_LOG2 # undef _CCCL_BUILTIN_LOG2L -#endif // _CCCL_CUDACC_BELOW(11, 7) +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) #if _CCCL_CHECK_BUILTIN(builtin_logb) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_LOGBF(...) __builtin_logbf(__VA_ARGS__) @@ -420,6 +637,74 @@ # define _CCCL_BUILTIN_OPERATOR_NEW(...) __builtin_operator_new(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete) +#if _CCCL_CHECK_BUILTIN(builtin_pow) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_POWF(...) __builtin_powf(__VA_ARGS__) +# define _CCCL_BUILTIN_POW(...) __builtin_pow(__VA_ARGS__) +# define _CCCL_BUILTIN_POWL(...) __builtin_powl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_pow) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "pow" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_POWF +# undef _CCCL_BUILTIN_POW +# undef _CCCL_BUILTIN_POWL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_rint) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_RINTF(...) __builtin_rintf(__VA_ARGS__) +# define _CCCL_BUILTIN_RINT(...) 
__builtin_rint(__VA_ARGS__) +# define _CCCL_BUILTIN_RINTL(...) __builtin_rintl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_rint) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_RINTF +# undef _CCCL_BUILTIN_RINT +# undef _CCCL_BUILTIN_RINTL +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_round) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_ROUNDF(...) __builtin_roundf(__VA_ARGS__) +# define _CCCL_BUILTIN_ROUND(...) __builtin_round(__VA_ARGS__) +# define _CCCL_BUILTIN_ROUNDL(...) __builtin_roundl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_round) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_ROUNDF +# undef _CCCL_BUILTIN_ROUND +# undef _CCCL_BUILTIN_ROUNDL +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_scalbln) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SCALBLNF(...) __builtin_scalblnf(__VA_ARGS__) +# define _CCCL_BUILTIN_SCALBLN(...) __builtin_scalbln(__VA_ARGS__) +# define _CCCL_BUILTIN_SCALBLNL(...) __builtin_scalblnl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_scalbln) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "scalblnf" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_SCALBLNF +# undef _CCCL_BUILTIN_SCALBLN +# undef _CCCL_BUILTIN_SCALBLNL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_scalbn) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SCALBNF(...) __builtin_scalbnf(__VA_ARGS__) +# define _CCCL_BUILTIN_SCALBN(...) __builtin_scalbn(__VA_ARGS__) +# define _CCCL_BUILTIN_SCALBNL(...) __builtin_scalbnl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_scalbn) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "scalbnf" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_SCALBNF +# undef _CCCL_BUILTIN_SCALBN +# undef _CCCL_BUILTIN_SCALBNL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + #if _CCCL_CHECK_BUILTIN(builtin_signbit) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_SIGNBIT(...) __builtin_signbit(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_signbit) @@ -429,6 +714,32 @@ # undef _CCCL_BUILTIN_SIGNBIT #endif // _CCCL_CUDACC_BELOW(11, 7) +#if _CCCL_CHECK_BUILTIN(builtin_sqrt) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SQRTF(...) __builtin_sqrtf(__VA_ARGS__) +# define _CCCL_BUILTIN_SQRT(...) __builtin_sqrt(__VA_ARGS__) +# define _CCCL_BUILTIN_SQRTL(...) __builtin_sqrtl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_sqrt) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_SQRTF +# undef _CCCL_BUILTIN_SQRT +# undef _CCCL_BUILTIN_SQRTL +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_trunc) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_TRUNCF(...) __builtin_truncf(__VA_ARGS__) +# define _CCCL_BUILTIN_TRUNC(...) __builtin_trunc(__VA_ARGS__) +# define _CCCL_BUILTIN_TRUNCL(...) 
__builtin_truncl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_trunc) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_TRUNCF +# undef _CCCL_BUILTIN_TRUNC +# undef _CCCL_BUILTIN_TRUNCL +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_HAS_BUILTIN(__decay) && _CCCL_CUDA_COMPILER(CLANG) # define _CCCL_BUILTIN_DECAY(...) __decay(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__decay) && clang-cuda diff --git a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h b/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h index 5169ea4ad67..dee553633d8 100644 --- a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h +++ b/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h @@ -39,4 +39,14 @@ # endif #endif // !_CCCL_HAS_NVBF16 +#if !defined(_CCCL_DISABLE_NVFP8_SUPPORT) +# if _CCCL_HAS_INCLUDE() && defined(_CCCL_HAS_NVFP16) && defined(_CCCL_HAS_NVBF16) +# define _CCCL_HAS_NVFP8() 1 +# else +# define _CCCL_HAS_NVFP8() 0 +# endif // _CCCL_HAS_INCLUDE() +#else +# define _CCCL_HAS_NVFP8() 0 +#endif // !defined(_CCCL_DISABLE_NVFP8_SUPPORT) + #endif // __CCCL_EXTENDED_FLOATING_POINT_H diff --git a/libcudacxx/include/cuda/std/__cmath/exponential_functions.h b/libcudacxx/include/cuda/std/__cmath/exponential_functions.h new file mode 100644 index 00000000000..f00f1807834 --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/exponential_functions.h @@ -0,0 +1,611 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
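A minimal usage sketch for the exponential and scaling overloads this new header introduces, assuming they are reachable through the public <cuda/std/cmath> entry point; the demo function name and the literal values are illustrative only, not part of the patch:

    // Sketch only: exercises the cuda::std overloads defined in exponential_functions.h.
    #include <cuda/std/cmath>

    void exponential_demo()
    {
      float  e   = cuda::std::exp(1.0f);            // float overload, maps to the builtin or ::expf
      double p2  = cuda::std::exp2(10);             // integral argument promotes to double -> 1024.0
      double em1 = cuda::std::expm1(1e-9);          // stays accurate near zero, unlike exp(x) - 1.0

      int   exponent = 0;
      float mantissa = cuda::std::frexp(8.0f, &exponent);    // mantissa = 0.5f, exponent = 4
      float back     = cuda::std::ldexp(mantissa, exponent); // reassembles 8.0f
      float scaled   = cuda::std::scalbn(1.5f, 3);           // 1.5 * 2^3 = 12.0f

      double pw = cuda::std::pow(2.0, 10);          // mixed arguments promote via __promote_t -> 1024.0

      (void) e; (void) p2; (void) em1; (void) back; (void) scaled; (void) pw;
    }

The same calls are expected to compile in device code, and the __half / __nv_bfloat16 overloads defined below follow the same dispatch pattern when the FP16/BF16 headers are enabled.
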
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___MATH_EXPONENTIAL_FUNCTIONS_H +#define _LIBCUDACXX___MATH_EXPONENTIAL_FUNCTIONS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// exp + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float exp(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXPF) + return _CCCL_BUILTIN_EXPF(__x); +#else // ^^^ _CCCL_BUILTIN_EXPF ^^^ // vvv !_CCCL_BUILTIN_EXPF vvv + return ::expf(__x); +#endif // !_CCCL_BUILTIN_EXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float expf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXPF) + return _CCCL_BUILTIN_EXPF(__x); +#else // ^^^ _CCCL_BUILTIN_EXPF ^^^ // vvv !_CCCL_BUILTIN_EXPF vvv + return ::expf(__x); +#endif // !_CCCL_BUILTIN_EXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double exp(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXP) + return _CCCL_BUILTIN_EXP(__x); +#else // ^^^ _CCCL_BUILTIN_EXP ^^^ // vvv !_CCCL_BUILTIN_EXP vvv + return ::exp(__x); +#endif // !_CCCL_BUILTIN_EXP +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double exp(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXPL) + return _CCCL_BUILTIN_EXPL(__x); +# else // ^^^ _CCCL_BUILTIN_EXPL ^^^ // vvv !_CCCL_BUILTIN_EXPL vvv + return ::expl(__x); +# endif // !_CCCL_BUILTIN_EXPL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double expl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXPL) + return _CCCL_BUILTIN_EXPL(__x); +# else // ^^^ _CCCL_BUILTIN_EXPL ^^^ // vvv !_CCCL_BUILTIN_EXPL vvv + return ::expl(__x); +# endif // !_CCCL_BUILTIN_EXPL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half exp(__half __x) noexcept +{ + { + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, (return ::hexp(__x);), ({ + float __xf = __half2float(__x); + __xf = ::expf(__xf); + __half_raw __ret_repr = ::__float2half_rn(__xf); + + uint16_t __repr = __half_raw(__x).x; + switch (__repr) + { + case 8057: + case 9679: + __ret_repr.x -= 1; + break; + + default:; + } + + return __ret_repr; + })) + } +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 exp(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hexp(__x);), (return __float2bfloat16(_CUDA_VSTD::expf(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double exp(_Integer __x) noexcept +{ + return _CUDA_VSTD::exp((double) __x); +} + +// frexp + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float frexp(float __x, int* __e) noexcept +{ +#if defined(_CCCL_BUILTIN_FREXPF) + return _CCCL_BUILTIN_FREXPF(__x, __e); +#else // ^^^ _CCCL_BUILTIN_FREXPF ^^^ // vvv !_CCCL_BUILTIN_FREXPF vvv + return ::frexpf(__x, __e); +#endif // !_CCCL_BUILTIN_FREXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float frexpf(float __x, int* __e) noexcept +{ +#if defined(_CCCL_BUILTIN_FREXPF) + return _CCCL_BUILTIN_FREXPF(__x, __e); +#else // ^^^ _CCCL_BUILTIN_FREXPF ^^^ // vvv !_CCCL_BUILTIN_FREXPF vvv + return 
::frexpf(__x, __e); +#endif // !_CCCL_BUILTIN_FREXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double frexp(double __x, int* __e) noexcept +{ +#if defined(_CCCL_BUILTIN_FREXP) + return _CCCL_BUILTIN_FREXP(__x, __e); +#else // ^^^ _CCCL_BUILTIN_FREXP ^^^ // vvv !_CCCL_BUILTIN_FREXP vvv + return ::frexp(__x, __e); +#endif // !_CCCL_BUILTIN_FREXP +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double frexp(long double __x, int* __e) noexcept +{ +# if defined(_CCCL_BUILTIN_FREXPL) + return _CCCL_BUILTIN_FREXPL(__x, __e); +# else // ^^^ _CCCL_BUILTIN_FREXPL ^^^ // vvv !_CCCL_BUILTIN_FREXPL vvv + return ::frexpl(__x, __e); +# endif // !_CCCL_BUILTIN_FREXPL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double frexpl(long double __x, int* __e) noexcept +{ +# if defined(_CCCL_BUILTIN_FREXPL) + return _CCCL_BUILTIN_FREXPL(__x, __e); +# else // ^^^ _CCCL_BUILTIN_FREXPL ^^^ // vvv !_CCCL_BUILTIN_FREXPL vvv + return ::frexpl(__x, __e); +# endif // !_CCCL_BUILTIN_FREXPL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half frexp(__half __x, int* __e) noexcept +{ + return __float2half(_CUDA_VSTD::frexpf(__half2float(__x), __e)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 frexp(__nv_bfloat16 __x, int* __e) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::frexpf(__bfloat162float(__x), __e)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double frexp(_Integer __x, int* __e) noexcept +{ + return _CUDA_VSTD::frexp((double) __x, __e); +} + +// ldexp + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float ldexp(float __x, int __e) noexcept +{ +#if defined(_CCCL_BUILTIN_LDEXPF) + return _CCCL_BUILTIN_LDEXPF(__x, __e); +#else // ^^^ _CCCL_BUILTIN_LDEXPF ^^^ // vvv !_CCCL_BUILTIN_LDEXPF vvv + return ::ldexpf(__x, __e); +#endif // !_CCCL_BUILTIN_LDEXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float ldexpf(float __x, int __e) noexcept +{ +#if defined(_CCCL_BUILTIN_LDEXPF) + return _CCCL_BUILTIN_LDEXPF(__x, __e); +#else // ^^^ _CCCL_BUILTIN_LDEXPF ^^^ // vvv !_CCCL_BUILTIN_LDEXPF vvv + return ::ldexpf(__x, __e); +#endif // !_CCCL_BUILTIN_LDEXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double ldexp(double __x, int __e) noexcept +{ +#if defined(_CCCL_BUILTIN_LDEXP) + return _CCCL_BUILTIN_LDEXP(__x, __e); +#else // ^^^ _CCCL_BUILTIN_LDEXP ^^^ // vvv !_CCCL_BUILTIN_LDEXP vvv + return ::ldexp(__x, __e); +#endif // !_CCCL_BUILTIN_LDEXP +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double ldexp(long double __x, int __e) noexcept +{ +# if defined(_CCCL_BUILTIN_LDEXPL) + return _CCCL_BUILTIN_LDEXPL(__x, __e); +# else // ^^^ _CCCL_BUILTIN_LDEXPL ^^^ // vvv !_CCCL_BUILTIN_LDEXPL vvv + return ::ldexpl(__x, __e); +# endif // !_CCCL_BUILTIN_LDEXPL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double ldexpl(long double __x, int __e) noexcept +{ +# if defined(_CCCL_BUILTIN_LDEXPL) + return _CCCL_BUILTIN_LDEXPL(__x, __e); +# else // ^^^ _CCCL_BUILTIN_LDEXPL ^^^ // vvv !_CCCL_BUILTIN_LDEXPL vvv + return ::ldexpl(__x, __e); +# endif // !_CCCL_BUILTIN_LDEXPL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half ldexp(__half __x, int __e) noexcept +{ + return __float2half(_CUDA_VSTD::ldexpf(__half2float(__x), 
__e)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 ldexp(__nv_bfloat16 __x, int __e) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::ldexpf(__bfloat162float(__x), __e)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double ldexp(_Integer __x, int __e) noexcept +{ + return _CUDA_VSTD::ldexp((double) __x, __e); +} + +// exp2 + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float exp2(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXP2F) + return _CCCL_BUILTIN_EXP2F(__x); +#else // ^^^ _CCCL_BUILTIN_EXP2F ^^^ // vvv !_CCCL_BUILTIN_EXP2F vvv + return ::exp2f(__x); +#endif // !_CCCL_BUILTIN_EXP2F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float exp2f(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXP2F) + return _CCCL_BUILTIN_EXP2F(__x); +#else // ^^^ _CCCL_BUILTIN_EXP2F ^^^ // vvv !_CCCL_BUILTIN_EXP2F vvv + return ::exp2f(__x); +#endif // !_CCCL_BUILTIN_EXP2F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double exp2(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXP2) + return _CCCL_BUILTIN_EXP2(__x); +#else // ^^^ _CCCL_BUILTIN_EXP2 ^^^ // vvv !_CCCL_BUILTIN_EXP2 vvv + return ::exp2(__x); +#endif // !_CCCL_BUILTIN_EXP2 +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double exp2(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXP2L) + return _CCCL_BUILTIN_EXP2L(__x); +# else // ^^^ _CCCL_BUILTIN_EXP2L ^^^ // vvv !_CCCL_BUILTIN_EXP2L vvv + return ::exp2l(__x); +# endif // !_CCCL_BUILTIN_EXP2L +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double exp2l(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXP2L) + return _CCCL_BUILTIN_EXP2L(__x); +# else // ^^^ _CCCL_BUILTIN_EXP2L ^^^ // vvv !_CCCL_BUILTIN_EXP2L vvv + return ::exp2l(__x); +# endif // !_CCCL_BUILTIN_EXP2L +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half exp2(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hexp2(__x);), (return __float2half(_CUDA_VSTD::exp2f(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 exp2(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hexp2(__x);), (return __float2bfloat16(_CUDA_VSTD::exp2f(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double exp2(_Integer __x) noexcept +{ + return _CUDA_VSTD::exp2((double) __x); +} + +// expm1 + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float expm1(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXPM1F) + return _CCCL_BUILTIN_EXPM1F(__x); +#else // ^^^ _CCCL_BUILTIN_EXPM1F ^^^ // vvv !_CCCL_BUILTIN_EXPM1F vvv + return ::expm1f(__x); +#endif // !_CCCL_BUILTIN_EXPM1F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float expm1f(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXPM1F) + return _CCCL_BUILTIN_EXPM1F(__x); +#else // ^^^ _CCCL_BUILTIN_EXPM1F ^^^ // vvv !_CCCL_BUILTIN_EXPM1F vvv + return ::expm1f(__x); +#endif // !_CCCL_BUILTIN_EXPM1F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double expm1(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXPM1) + return _CCCL_BUILTIN_EXPM1(__x); +#else // ^^^ _CCCL_BUILTIN_EXPM1 ^^^ // vvv !_CCCL_BUILTIN_EXPM1 vvv + return ::expm1(__x); +#endif // !_CCCL_BUILTIN_EXPM1 +} + +#if 
!defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double expm1(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXPM1L) + return _CCCL_BUILTIN_EXPM1L(__x); +# else // ^^^ _CCCL_BUILTIN_EXPM1L ^^^ // vvv !_CCCL_BUILTIN_EXPM1L vvv + return ::expm1l(__x); +# endif // !_CCCL_BUILTIN_EXPM1L +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double expm1l(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXPM1L) + return _CCCL_BUILTIN_EXPM1L(__x); +# else // ^^^ _CCCL_BUILTIN_EXPM1L ^^^ // vvv !_CCCL_BUILTIN_EXPM1L vvv + return ::expm1l(__x); +# endif // !_CCCL_BUILTIN_EXPM1L +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half expm1(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::expm1f(__half2float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 expm1(__nv_bfloat16 __x) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::expm1f(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double expm1(_Integer __x) noexcept +{ + return _CUDA_VSTD::expm1((double) __x); +} + +// scalbln + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float scalbln(float __x, long __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBLNF) + return _CCCL_BUILTIN_SCALBLNF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBLNF ^^^ // vvv !_CCCL_BUILTIN_SCALBLNF vvv + return ::scalblnf(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBLNF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float scalblnf(float __x, long __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBLNF) + return _CCCL_BUILTIN_SCALBLNF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBLNF ^^^ // vvv !_CCCL_BUILTIN_SCALBLNF vvv + return ::scalblnf(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBLNF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double scalbln(double __x, long __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBLN) + return _CCCL_BUILTIN_SCALBLN(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBLN ^^^ // vvv !_CCCL_BUILTIN_SCALBLN vvv + return ::scalbln(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBLN +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double scalbln(long double __x, long __y) noexcept +{ +# if defined(_CCCL_BUILTIN_SCALBLNL) + return _CCCL_BUILTIN_SCALBLNL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_SCALBLNL ^^^ // vvv !_CCCL_BUILTIN_SCALBLNL vvv + return ::scalblnl(__x, __y); +# endif // !_CCCL_BUILTIN_SCALBLNL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double scalblnl(long double __x, long __y) noexcept +{ +# if defined(_CCCL_BUILTIN_SCALBLNL) + return _CCCL_BUILTIN_SCALBLNL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_SCALBLNL ^^^ // vvv !_CCCL_BUILTIN_SCALBLNL vvv + return ::scalblnl(__x, __y); +# endif // !_CCCL_BUILTIN_SCALBLNL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half scalbln(__half __x, long __y) noexcept +{ + return __float2half(_CUDA_VSTD::scalblnf(__half2float(__x), __y)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 scalbln(__nv_bfloat16 __x, long __y) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::scalblnf(__bfloat162float(__x), __y)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI 
double scalbln(_Integer __x, long __y) noexcept +{ + return _CUDA_VSTD::scalbln((double) __x, __y); +} + +// scalbn + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float scalbn(float __x, int __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBNF) + return _CCCL_BUILTIN_SCALBNF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBNF ^^^ // vvv !_CCCL_BUILTIN_SCALBNF vvv + return ::scalbnf(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBNF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float scalbnf(float __x, int __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBNF) + return _CCCL_BUILTIN_SCALBNF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBNF ^^^ // vvv !_CCCL_BUILTIN_SCALBNF vvv + return ::scalbnf(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBNF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double scalbn(double __x, int __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBN) + return _CCCL_BUILTIN_SCALBN(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBN ^^^ // vvv !_CCCL_BUILTIN_SCALBN vvv + return ::scalbn(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBN +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double scalbn(long double __x, int __y) noexcept +{ +# if defined(_CCCL_BUILTIN_SCALBNL) + return _CCCL_BUILTIN_SCALBNL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_SCALBNL ^^^ // vvv !_CCCL_BUILTIN_SCALBNL vvv + return ::scalbnl(__x, __y); +# endif // !_CCCL_BUILTIN_SCALBNL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double scalbnl(long double __x, int __y) noexcept +{ +# if defined(_CCCL_BUILTIN_SCALBNL) + return _CCCL_BUILTIN_SCALBNL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_SCALBNL ^^^ // vvv !_CCCL_BUILTIN_SCALBNL vvv + return ::scalbnl(__x, __y); +# endif // !_CCCL_BUILTIN_SCALBNL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half scalbn(__half __x, int __y) noexcept +{ + return __float2half(_CUDA_VSTD::scalbnf(__half2float(__x), __y)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 scalbn(__nv_bfloat16 __x, int __y) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::scalbnf(__bfloat162float(__x), __y)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double scalbn(_Integer __x, int __y) noexcept +{ + return _CUDA_VSTD::scalbn((double) __x, __y); +} + +// pow + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float pow(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_POWF) + return _CCCL_BUILTIN_POWF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_POWF ^^^ // vvv !_CCCL_BUILTIN_POWF vvv + return ::powf(__x, __y); +#endif // !_CCCL_BUILTIN_POWF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float powf(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_POWF) + return _CCCL_BUILTIN_POWF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_POWF ^^^ // vvv !_CCCL_BUILTIN_POWF vvv + return ::powf(__x, __y); +#endif // !_CCCL_BUILTIN_POWF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double pow(double __x, double __y) noexcept +{ +#if defined(_CCCL_BUILTIN_POW) + return _CCCL_BUILTIN_POW(__x, __y); +#else // ^^^ _CCCL_BUILTIN_POW ^^^ // vvv !_CCCL_BUILTIN_POW vvv + return ::pow(__x, __y); +#endif // !_CCCL_BUILTIN_POW +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double pow(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_POWL) + return _CCCL_BUILTIN_POWL(__x, __y); +# else // ^^^ 
_CCCL_BUILTIN_POWL ^^^ // vvv !_CCCL_BUILTIN_POWL vvv + return ::powl(__x, __y); +# endif // !_CCCL_BUILTIN_POWL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double powl(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_POWL) + return _CCCL_BUILTIN_POWL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_POWL ^^^ // vvv !_CCCL_BUILTIN_POWL vvv + return ::powl(__x, __y); +# endif // !_CCCL_BUILTIN_POWL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half pow(__half __x, __half __y) noexcept +{ + return __float2half(_CUDA_VSTD::powf(__half2float(__x), __half2float(__y))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 pow(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::powf(__bfloat162float(__x), __bfloat162float(__y))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, _A2> pow(_A1 __x, _A2 __y) noexcept +{ + using __result_type = __promote_t<_A1, _A2>; + static_assert(!(_CCCL_TRAIT(is_same, _A1, __result_type) && _CCCL_TRAIT(is_same, _A2, __result_type)), ""); + return _CUDA_VSTD::pow((__result_type) __x, (__result_type) __y); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___MATH_EXPONENTIAL_FUNCTIONS_H diff --git a/libcudacxx/include/cuda/std/__cmath/nvbf16.h b/libcudacxx/include/cuda/std/__cmath/nvbf16.h index 8f116968f8b..b0bda438e6e 100644 --- a/libcudacxx/include/cuda/std/__cmath/nvbf16.h +++ b/libcudacxx/include/cuda/std/__cmath/nvbf16.h @@ -55,11 +55,6 @@ _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 cosh(__nv_bfloat16 __v) return __float2bfloat16(::coshf(__bfloat162float(__v))); } -_LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 exp(__nv_bfloat16 __v) -{ - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hexp(__v);), (return __float2bfloat16(::expf(__bfloat162float(__v)));)) -} - _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 hypot(__nv_bfloat16 __x, __nv_bfloat16 __y) { return __float2bfloat16(::hypotf(__bfloat162float(__x), __bfloat162float(__y))); @@ -70,11 +65,6 @@ _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 atan2(__nv_bfloat16 __x, __nv_bfloat16 _ return __float2bfloat16(::atan2f(__bfloat162float(__x), __bfloat162float(__y))); } -_LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 sqrt(__nv_bfloat16 __x) -{ - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2bfloat16(::sqrtf(__bfloat162float(__x)));)) -} - // floating point helper _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_copysign(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept { diff --git a/libcudacxx/include/cuda/std/__cmath/nvfp16.h b/libcudacxx/include/cuda/std/__cmath/nvfp16.h index dbcaebbb4ef..1f295088aaf 100644 --- a/libcudacxx/include/cuda/std/__cmath/nvfp16.h +++ b/libcudacxx/include/cuda/std/__cmath/nvfp16.h @@ -97,34 +97,6 @@ _LIBCUDACXX_HIDE_FROM_ABI __half cosh(__half __v) return __float2half(::coshf(__half2float(__v))); } -// clang-format off -_LIBCUDACXX_HIDE_FROM_ABI __half exp(__half __v) -{ - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, ( - return ::hexp(__v); - ), ( - { - float __vf = __half2float(__v); - __vf = ::expf(__vf); - __half_raw __ret_repr = ::__float2half_rn(__vf); - - uint16_t __repr = __half_raw(__v).x; - switch (__repr) - { - case 8057: - case 9679: - __ret_repr.x -= 1; - break; - - default:; - } - - return __ret_repr; - } - )) -} -// clang-format on - _LIBCUDACXX_HIDE_FROM_ABI __half hypot(__half __x, __half 
__y) { return __float2half(::hypotf(__half2float(__x), __half2float(__y))); @@ -135,11 +107,6 @@ _LIBCUDACXX_HIDE_FROM_ABI __half atan2(__half __x, __half __y) return __float2half(::atan2f(__half2float(__x), __half2float(__y))); } -_LIBCUDACXX_HIDE_FROM_ABI __half sqrt(__half __x) -{ - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2half(::sqrtf(__half2float(__x)));)) -} - // floating point helper _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_copysign(__half __x, __half __y) noexcept { diff --git a/libcudacxx/include/cuda/std/__cmath/roots.h b/libcudacxx/include/cuda/std/__cmath/roots.h new file mode 100644 index 00000000000..0d2065dcf5a --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/roots.h @@ -0,0 +1,171 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_ROOTS_H +#define _LIBCUDACXX___CMATH_ROOTS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// sqrt + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float sqrt(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_SQRTF) + return _CCCL_BUILTIN_SQRTF(__x); +#else // ^^^ _CCCL_BUILTIN_SQRTF ^^^ // vvv !_CCCL_BUILTIN_SQRTF vvv + return ::sqrtf(__x); +#endif // !_CCCL_BUILTIN_SQRTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float sqrtf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_SQRTF) + return _CCCL_BUILTIN_SQRTF(__x); +#else // ^^^ _CCCL_BUILTIN_SQRTF ^^^ // vvv !_CCCL_BUILTIN_SQRTF vvv + return ::sqrtf(__x); +#endif // !_CCCL_BUILTIN_SQRTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double sqrt(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_SQRT) + return _CCCL_BUILTIN_SQRT(__x); +#else // ^^^ _CCCL_BUILTIN_SQRT ^^^ // vvv !_CCCL_BUILTIN_SQRT vvv + return ::sqrt(__x); +#endif // !_CCCL_BUILTIN_SQRT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double sqrt(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_SQRTL) + return _CCCL_BUILTIN_SQRTL(__x); +# else // ^^^ _CCCL_BUILTIN_SQRTL ^^^ // vvv !_CCCL_BUILTIN_SQRTL vvv + return ::sqrtl(__x); +# endif // !_CCCL_BUILTIN_SQRTL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double sqrtl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_SQRTL) + return _CCCL_BUILTIN_SQRTL(__x); +# else // ^^^ _CCCL_BUILTIN_SQRTL ^^^ // vvv !_CCCL_BUILTIN_SQRTL vvv + return ::sqrtl(__x); +# endif // !_CCCL_BUILTIN_SQRTL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half sqrt(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2half(_CUDA_VSTD::sqrt(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 sqrt(__nv_bfloat16 __x) noexcept +{ + 
NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2bfloat16(_CUDA_VSTD::sqrt(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double sqrt(_Integer __x) noexcept +{ + return _CUDA_VSTD::sqrt((double) __x); +} + +// cbrt + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float cbrt(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CBRTF) + return _CCCL_BUILTIN_CBRTF(__x); +#else // ^^^ _CCCL_BUILTIN_CBRTF ^^^ // vvv !_CCCL_BUILTIN_CBRTF vvv + return ::cbrtf(__x); +#endif // !_CCCL_BUILTIN_CBRTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float cbrtf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CBRTF) + return _CCCL_BUILTIN_CBRTF(__x); +#else // ^^^ _CCCL_BUILTIN_CBRTF ^^^ // vvv !_CCCL_BUILTIN_CBRTF vvv + return ::cbrtf(__x); +#endif // !_CCCL_BUILTIN_CBRTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double cbrt(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CBRT) + return _CCCL_BUILTIN_CBRT(__x); +#else // ^^^ _CCCL_BUILTIN_CBRT ^^^ // vvv !_CCCL_BUILTIN_CBRT vvv + return ::cbrt(__x); +#endif // !_CCCL_BUILTIN_CBRT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double cbrt(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_CBRTL) + return _CCCL_BUILTIN_CBRTL(__x); +# else // ^^^ _CCCL_BUILTIN_CBRTL ^^^ // vvv !_CCCL_BUILTIN_CBRTL vvv + return ::cbrtl(__x); +# endif // !_CCCL_BUILTIN_CBRTL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double cbrtl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_CBRTL) + return _CCCL_BUILTIN_CBRTL(__x); +# else // ^^^ _CCCL_BUILTIN_CBRTL ^^^ // vvv !_CCCL_BUILTIN_CBRTL vvv + return ::cbrtl(__x); +# endif // !_CCCL_BUILTIN_CBRTL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half cbrt(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::cbrt(__half2float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 cbrt(__nv_bfloat16 __x) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::cbrt(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double cbrt(_Integer __x) noexcept +{ + return _CUDA_VSTD::cbrt((double) __x); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_ROOTS_H diff --git a/libcudacxx/include/cuda/std/__cmath/rounding_functions.h b/libcudacxx/include/cuda/std/__cmath/rounding_functions.h new file mode 100644 index 00000000000..4404ce446c4 --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/rounding_functions.h @@ -0,0 +1,868 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
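As above, a small sketch of the rounding overloads added by this header, again assuming <cuda/std/cmath> exposes them; the function name and literals are placeholders rather than part of the patch:

    // Sketch only: exercises the cuda::std overloads defined in rounding_functions.h.
    #include <cuda/std/cmath>

    void rounding_demo()
    {
      double up   = cuda::std::ceil(2.1);             // 3.0
      double down = cuda::std::floor(-2.1);           // -3.0
      long   lr   = cuda::std::lround(2.5f);          // 3: halfway cases round away from zero
      double nb   = cuda::std::nearbyint(2.5);        // 2.0 under the default round-to-nearest-even mode
      long long r = cuda::std::llrint(3.7);           // 4 under the default rounding mode
      float  next = cuda::std::nextafter(1.0f, 2.0f); // smallest representable float above 1.0f

      (void) up; (void) down; (void) lr; (void) nb; (void) r; (void) next;
    }

Integral arguments dispatch to the double overloads (for example cuda::std::round(3) returns 3.0), and the __half / __nv_bfloat16 overloads go through float or the native half intrinsics, as the definitions below show.
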
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_ROUNDING_FUNCTIONS_H +#define _LIBCUDACXX___CMATH_ROUNDING_FUNCTIONS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// ceil + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float ceil(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CEILF) + return _CCCL_BUILTIN_CEILF(__x); +#else // ^^^ _CCCL_BUILTIN_CEILF ^^^ // vvv !_CCCL_BUILTIN_CEILF vvv + return ::ceilf(__x); +#endif // !_CCCL_BUILTIN_CEILF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float ceilf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CEILF) + return _CCCL_BUILTIN_CEILF(__x); +#else // ^^^ _CCCL_BUILTIN_CEILF ^^^ // vvv !_CCCL_BUILTIN_CEILF vvv + return ::ceilf(__x); +#endif // !_CCCL_BUILTIN_CEILF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double ceil(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CEIL) + return _CCCL_BUILTIN_CEIL(__x); +#else // ^^^ _CCCL_BUILTIN_CEIL ^^^ // vvv !_CCCL_BUILTIN_CEIL vvv + return ::ceil(__x); +#endif // !_CCCL_BUILTIN_CEIL +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double ceil(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_CEILL) + return _CCCL_BUILTIN_CEILL(__x); +# else // ^^^ _CCCL_BUILTIN_CEILL ^^^ // vvv !_CCCL_BUILTIN_CEILL vvv + return ::ceill(__x); +# endif // !_CCCL_BUILTIN_CEILL +} +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double ceill(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_CEILL) + return _CCCL_BUILTIN_CEILL(__x); +# else // ^^^ _CCCL_BUILTIN_CEILL ^^^ // vvv !_CCCL_BUILTIN_CEILL vvv + return ::ceill(__x); +# endif // !_CCCL_BUILTIN_CEILL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half ceil(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hceil(__x);), (return __float2half(_CUDA_VSTD::ceil(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 ceil(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hceil(__x);), (return __float2bfloat16(_CUDA_VSTD::ceil(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double ceil(_Integer __x) noexcept +{ + return _CUDA_VSTD::ceil((double) __x); +} + +// floor + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float floor(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_FLOORF) + return _CCCL_BUILTIN_FLOORF(__x); +#else // ^^^ _CCCL_BUILTIN_FLOORF ^^^ // vvv !_CCCL_BUILTIN_FLOORF vvv + return ::floorf(__x); +#endif // !_CCCL_BUILTIN_FLOORF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float floorf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_FLOORF) + return _CCCL_BUILTIN_FLOORF(__x); +#else // ^^^ _CCCL_BUILTIN_FLOORF ^^^ // vvv !_CCCL_BUILTIN_FLOORF vvv + return ::floorf(__x); +#endif // !_CCCL_BUILTIN_FLOORF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double floor(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_FLOOR) + return _CCCL_BUILTIN_FLOOR(__x); +#else // ^^^ _CCCL_BUILTIN_FLOOR ^^^ // vvv 
!_CCCL_BUILTIN_FLOOR vvv + return ::floor(__x); +#endif // !_CCCL_BUILTIN_FLOOR +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double floor(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_FLOORL) + return _CCCL_BUILTIN_FLOORL(__x); +# else // ^^^ _CCCL_BUILTIN_FLOORL ^^^ // vvv !_CCCL_BUILTIN_FLOORL vvv + return ::floorl(__x); +# endif // !_CCCL_BUILTIN_FLOORL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double floorl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_FLOORL) + return _CCCL_BUILTIN_FLOORL(__x); +# else // ^^^ _CCCL_BUILTIN_FLOORL ^^^ // vvv !_CCCL_BUILTIN_FLOORL vvv + return ::floorl(__x); +# endif // !_CCCL_BUILTIN_FLOORL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half floor(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hfloor(__x);), (return __float2half(_CUDA_VSTD::floor(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 floor(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hfloor(__x);), (return __float2bfloat16(_CUDA_VSTD::floor(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double floor(_Integer __x) noexcept +{ + return _CUDA_VSTD::floor((double) __x); +} + +// llrint + +_LIBCUDACXX_HIDE_FROM_ABI long long llrint(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLRINTF) + return _CCCL_BUILTIN_LLRINTF(__x); +#else // ^^^ _CCCL_BUILTIN_LLRINTF ^^^ // vvv !_CCCL_BUILTIN_LLRINTF vvv + return ::llrintf(__x); +#endif // !_CCCL_BUILTIN_LLRINTF +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llrintf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLRINTF) + return _CCCL_BUILTIN_LLRINTF(__x); +#else // ^^^ _CCCL_BUILTIN_LLRINTF ^^^ // vvv !_CCCL_BUILTIN_LLRINTF vvv + return ::llrintf(__x); +#endif // !_CCCL_BUILTIN_LLRINTF +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llrint(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLRINT) + return _CCCL_BUILTIN_LLRINT(__x); +#else // ^^^ _CCCL_BUILTIN_LLRINT ^^^ // vvv !_CCCL_BUILTIN_LLRINT vvv + return ::llrint(__x); +#endif // !_CCCL_BUILTIN_LLRINT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI long long llrint(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LLRINTL) + return _CCCL_BUILTIN_LLRINTL(__x); +# else // ^^^ _CCCL_BUILTIN_LLRINTL ^^^ // vvv !_CCCL_BUILTIN_LLRINTL vvv + return ::llrintl(__x); +# endif // !_CCCL_BUILTIN_LLRINTL +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llrintl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LLRINTL) + return _CCCL_BUILTIN_LLRINTL(__x); +# else // ^^^ _CCCL_BUILTIN_LLRINTL ^^^ // vvv !_CCCL_BUILTIN_LLRINTL vvv + return ::llrintl(__x); +# endif // !_CCCL_BUILTIN_LLRINTL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long long llrint(__half __x) noexcept +{ + return _CUDA_VSTD::llrintf(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long long llrint(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::llrintf(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI long long llrint(_Integer __x) noexcept +{ + return _CUDA_VSTD::llrint((double) 
__x); +} + +// llround + +_LIBCUDACXX_HIDE_FROM_ABI long long llround(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLROUNDF) + return _CCCL_BUILTIN_LLROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_LLROUNDF ^^^ // vvv !_CCCL_BUILTIN_LLROUNDF vvv + return ::llroundf(__x); +#endif // !_CCCL_BUILTIN_LLROUNDF +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llroundf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLROUNDF) + return _CCCL_BUILTIN_LLROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_LLROUNDF ^^^ // vvv !_CCCL_BUILTIN_LLROUNDF vvv + return ::llroundf(__x); +#endif // !_CCCL_BUILTIN_LLROUNDF +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llround(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLROUND) + return _CCCL_BUILTIN_LLROUND(__x); +#else // ^^^ _CCCL_BUILTIN_LLROUND ^^^ // vvv !_CCCL_BUILTIN_LLROUND vvv + return ::llround(__x); +#endif // !_CCCL_BUILTIN_LLROUND +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI long long llround(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LLROUNDL) + return _CCCL_BUILTIN_LLROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_LLROUNDL ^^^ // vvv !_CCCL_BUILTIN_LLROUNDL vvv + return ::llroundl(__x); +# endif // !_CCCL_BUILTIN_LLROUNDL +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llroundl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LLROUNDL) + return _CCCL_BUILTIN_LLROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_LLROUNDL ^^^ // vvv !_CCCL_BUILTIN_LLROUNDL vvv + return ::llroundl(__x); +# endif // !_CCCL_BUILTIN_LLROUNDL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long long llround(__half __x) noexcept +{ + return _CUDA_VSTD::llroundf(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long long llround(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::llroundf(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI long long llround(_Integer __x) noexcept +{ + return _CUDA_VSTD::llround((double) __x); +} + +// lrint + +_LIBCUDACXX_HIDE_FROM_ABI long lrint(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LRINTF) + return _CCCL_BUILTIN_LRINTF(__x); +#else // ^^^ _CCCL_BUILTIN_LRINTF ^^^ // vvv !_CCCL_BUILTIN_LRINTF vvv + return ::lrintf(__x); +#endif // !_CCCL_BUILTIN_LRINTF +} + +_LIBCUDACXX_HIDE_FROM_ABI long lrintf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LRINTF) + return _CCCL_BUILTIN_LRINTF(__x); +#else // ^^^ _CCCL_BUILTIN_LRINTF ^^^ // vvv !_CCCL_BUILTIN_LRINTF vvv + return ::lrintf(__x); +#endif // !_CCCL_BUILTIN_LRINTF +} + +_LIBCUDACXX_HIDE_FROM_ABI long lrint(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LRINT) + return _CCCL_BUILTIN_LRINT(__x); +#else // ^^^ _CCCL_BUILTIN_LRINT ^^^ // vvv !_CCCL_BUILTIN_LRINT vvv + return ::lrint(__x); +#endif // !_CCCL_BUILTIN_LRINT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI long lrint(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LRINTL) + return _CCCL_BUILTIN_LRINTL(__x); +# else // ^^^ _CCCL_BUILTIN_LRINTL ^^^ // vvv !_CCCL_BUILTIN_LRINTL vvv + return ::lrintl(__x); +# endif // !_CCCL_BUILTIN_LRINTL +} + +_LIBCUDACXX_HIDE_FROM_ABI long lrintl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LRINTL) + return _CCCL_BUILTIN_LRINTL(__x); +# else // ^^^ _CCCL_BUILTIN_LRINTL ^^^ // vvv !_CCCL_BUILTIN_LRINTL vvv + return ::lrintl(__x); +# endif // !_CCCL_BUILTIN_LRINTL +} +#endif // 
!_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long lrint(__half __x) noexcept +{ + return _CUDA_VSTD::lrintf(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long lrint(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::lrintf(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI long lrint(_Integer __x) noexcept +{ + return _CUDA_VSTD::lrint((double) __x); +} + +// lround + +_LIBCUDACXX_HIDE_FROM_ABI long lround(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LROUNDF) + return _CCCL_BUILTIN_LROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_LROUNDF ^^^ // vvv !_CCCL_BUILTIN_LROUNDF vvv + return ::lroundf(__x); +#endif // !_CCCL_BUILTIN_LROUNDF +} + +_LIBCUDACXX_HIDE_FROM_ABI long lroundf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LROUNDF) + return _CCCL_BUILTIN_LROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_LROUNDF ^^^ // vvv !_CCCL_BUILTIN_LROUNDF vvv + return ::lroundf(__x); +#endif // !_CCCL_BUILTIN_LROUNDF +} + +_LIBCUDACXX_HIDE_FROM_ABI long lround(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LROUND) + return _CCCL_BUILTIN_LROUND(__x); +#else // ^^^ _CCCL_BUILTIN_LROUND ^^^ // vvv !_CCCL_BUILTIN_LROUND vvv + return ::lround(__x); +#endif // !_CCCL_BUILTIN_LROUND +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI long lround(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LROUNDL) + return _CCCL_BUILTIN_LROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_LROUNDL ^^^ // vvv !_CCCL_BUILTIN_LROUNDL vvv + return ::lroundl(__x); +# endif // !_CCCL_BUILTIN_LROUNDL +} + +_LIBCUDACXX_HIDE_FROM_ABI long lroundl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LROUNDL) + return _CCCL_BUILTIN_LROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_LROUNDL ^^^ // vvv !_CCCL_BUILTIN_LROUNDL vvv + return ::lroundl(__x); +# endif // !_CCCL_BUILTIN_LROUNDL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long lround(__half __x) noexcept +{ + return _CUDA_VSTD::lroundf(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long lround(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::lroundf(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI long lround(_Integer __x) noexcept +{ + return _CUDA_VSTD::lround((double) __x); +} + +// nearbyint + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float nearbyint(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_NEARBYINTF) + return _CCCL_BUILTIN_NEARBYINTF(__x); +#else // ^^^ _CCCL_BUILTIN_NEARBYINTF ^^^ // vvv !_CCCL_BUILTIN_NEARBYINTF vvv + return ::nearbyintf(__x); +#endif // !_CCCL_BUILTIN_NEARBYINTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float nearbyintf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_NEARBYINTF) + return _CCCL_BUILTIN_NEARBYINTF(__x); +#else // ^^^ _CCCL_BUILTIN_NEARBYINTF ^^^ // vvv !_CCCL_BUILTIN_NEARBYINTF vvv + return ::nearbyintf(__x); +#endif // !_CCCL_BUILTIN_NEARBYINTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double nearbyint(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_NEARBYINT) + return _CCCL_BUILTIN_NEARBYINT(__x); +#else // ^^^ _CCCL_BUILTIN_NEARBYINT ^^^ // vvv !_CCCL_BUILTIN_NEARBYINT vvv + return ::nearbyint(__x); +#endif // !_CCCL_BUILTIN_NEARBYINT +} + +#if 
!defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double nearbyint(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_NEARBYINTL) + return _CCCL_BUILTIN_NEARBYINTL(__x); +# else // ^^^ _CCCL_BUILTIN_NEARBYINTL ^^^ // vvv !_CCCL_BUILTIN_NEARBYINTL vvv + return ::nearbyintl(__x); +# endif // !_CCCL_BUILTIN_NEARBYINTL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double nearbyintl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_NEARBYINTL) + return _CCCL_BUILTIN_NEARBYINTL(__x); +# else // ^^^ _CCCL_BUILTIN_NEARBYINTL ^^^ // vvv !_CCCL_BUILTIN_NEARBYINTL vvv + return ::nearbyintl(__x); +# endif // !_CCCL_BUILTIN_NEARBYINTL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half nearbyint(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::nearbyintf(__half2float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 nearbyint(__nv_bfloat16 __x) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::nearbyintf(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double nearbyint(_Integer __x) noexcept +{ + return _CUDA_VSTD::nearbyint((double) __x); +} + +// nextafter + +_LIBCUDACXX_HIDE_FROM_ABI float nextafter(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_NEXTAFTERF) + return _CCCL_BUILTIN_NEXTAFTERF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_NEXTAFTERF ^^^ // vvv !_CCCL_BUILTIN_NEXTAFTERF vvv + return ::nextafterf(__x, __y); +#endif // !_CCCL_BUILTIN_NEXTAFTERF +} + +_LIBCUDACXX_HIDE_FROM_ABI float nextafterf(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_NEXTAFTERF) + return _CCCL_BUILTIN_NEXTAFTERF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_NEXTAFTERF ^^^ // vvv !_CCCL_BUILTIN_NEXTAFTERF vvv + return ::nextafterf(__x, __y); +#endif // !_CCCL_BUILTIN_NEXTAFTERF +} + +_LIBCUDACXX_HIDE_FROM_ABI double nextafter(double __x, double __y) noexcept +{ +#if defined(_CCCL_BUILTIN_NEXTAFTER) + return _CCCL_BUILTIN_NEXTAFTER(__x, __y); +#else // ^^^ _CCCL_BUILTIN_NEXTAFTER ^^^ // vvv !_CCCL_BUILTIN_NEXTAFTER vvv + return ::nextafter(__x, __y); +#endif // !_CCCL_BUILTIN_NEXTAFTER +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI long double nextafter(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTAFTERL) + return _CCCL_BUILTIN_NEXTAFTERL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTAFTERL ^^^ // vvv !_CCCL_BUILTIN_NEXTAFTERL vvv + return ::nextafterl(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTAFTERL +} + +_LIBCUDACXX_HIDE_FROM_ABI long double nextafterl(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTAFTERL) + return _CCCL_BUILTIN_NEXTAFTERL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTAFTERL ^^^ // vvv !_CCCL_BUILTIN_NEXTAFTERL vvv + return ::nextafterl(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTAFTERL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half nextafter(__half __x, __half __y) noexcept +{ + return __float2half(_CUDA_VSTD::nextafterf(__half2float(__x), __half2float(__y))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 nextafter(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept +{ + return 
__float2bfloat16(_CUDA_VSTD::nextafterf(__bfloat162float(__x), __bfloat162float(__y))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, _A2> nextafter(_A1 __x, _A2 __y) noexcept +{ + using __result_type = __promote_t<_A1, _A2>; + static_assert(!(_CCCL_TRAIT(is_same, _A1, __result_type) && _CCCL_TRAIT(is_same, _A2, __result_type)), ""); + return _CUDA_VSTD::nextafter(static_cast<__result_type>(__x), static_cast<__result_type>(__y)); +} + +// nexttoward + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI float nexttoward(float __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTTOWARDF) + return _CCCL_BUILTIN_NEXTTOWARDF(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTTOWARDF ^^^ // vvv !_CCCL_BUILTIN_NEXTTOWARDF vvv + return ::nexttowardf(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTTOWARDF +} + +_LIBCUDACXX_HIDE_FROM_ABI float nexttowardf(float __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTTOWARDF) + return _CCCL_BUILTIN_NEXTTOWARDF(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTTOWARDF ^^^ // vvv !_CCCL_BUILTIN_NEXTTOWARDF vvv + return ::nexttowardf(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTTOWARDF +} + +_LIBCUDACXX_HIDE_FROM_ABI double nexttoward(double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTTOWARD) + return _CCCL_BUILTIN_NEXTTOWARD(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTTOWARD ^^^ // vvv !_CCCL_BUILTIN_NEXTTOWARD vvv + return ::nexttoward(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTTOWARD +} + +_LIBCUDACXX_HIDE_FROM_ABI long double nexttoward(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTTOWARDL) + return _CCCL_BUILTIN_NEXTTOWARDL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTTOWARDL ^^^ // vvv !_CCCL_BUILTIN_NEXTTOWARDL vvv + return ::nexttowardl(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTTOWARDL +} + +_LIBCUDACXX_HIDE_FROM_ABI long double nexttowardl(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTTOWARDL) + return _CCCL_BUILTIN_NEXTTOWARDL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTTOWARDL ^^^ // vvv !_CCCL_BUILTIN_NEXTTOWARDL vvv + return ::nexttowardl(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTTOWARDL +} + +# if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half nexttoward(__half __x, long double __y) noexcept +{ + return __float2half(_CUDA_VSTD::nexttowardf(__half2float(__x), __y)); +} +# endif // _LIBCUDACXX_HAS_NVFP16 + +# if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 nexttoward(__nv_bfloat16 __x, long double __y) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::nexttowardf(__bfloat162float(__x), __y)); +} +# endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI double nexttoward(_Integer __x, long double __y) noexcept +{ + return _CUDA_VSTD::nexttoward((double) __x, __y); +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +// rint + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float rint(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_RINTF) + return _CCCL_BUILTIN_RINTF(__x); +#else // ^^^ _CCCL_BUILTIN_RINTF ^^^ // vvv !_CCCL_BUILTIN_RINTF vvv + return ::rintf(__x); +#endif // !_CCCL_BUILTIN_RINTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float rintf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_RINTF) + return _CCCL_BUILTIN_RINTF(__x); +#else // ^^^ _CCCL_BUILTIN_RINTF ^^^ // vvv !_CCCL_BUILTIN_RINTF vvv + return ::rintf(__x); +#endif // !_CCCL_BUILTIN_RINTF +} + +_CCCL_NODISCARD 
_LIBCUDACXX_HIDE_FROM_ABI double rint(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_RINT) + return _CCCL_BUILTIN_RINT(__x); +#else // ^^^ _CCCL_BUILTIN_RINT ^^^ // vvv !_CCCL_BUILTIN_RINT vvv + return ::rint(__x); +#endif // !_CCCL_BUILTIN_RINT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double rint(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_RINTL) + return _CCCL_BUILTIN_RINTL(__x); +# else // ^^^ _CCCL_BUILTIN_RINTL ^^^ // vvv !_CCCL_BUILTIN_RINTL vvv + return ::rintl(__x); +# endif // !_CCCL_BUILTIN_RINTL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double rintl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_RINTL) + return _CCCL_BUILTIN_RINTL(__x); +# else // ^^^ _CCCL_BUILTIN_RINTL ^^^ // vvv !_CCCL_BUILTIN_RINTL vvv + return ::rintl(__x); +# endif // !_CCCL_BUILTIN_RINTL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half rint(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hrint(__x);), (return __float2half(_CUDA_VSTD::rint(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 rint(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hrint(__x);), (return __float2bfloat16(_CUDA_VSTD::rint(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double rint(_Integer __x) noexcept +{ + return _CUDA_VSTD::rint((double) __x); +} + +// round + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float round(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ROUNDF) + return _CCCL_BUILTIN_ROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_ROUNDF ^^^ // vvv !_CCCL_BUILTIN_ROUNDF vvv + return ::roundf(__x); +#endif // !_CCCL_BUILTIN_ROUNDF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float roundf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ROUNDF) + return _CCCL_BUILTIN_ROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_ROUNDF ^^^ // vvv !_CCCL_BUILTIN_ROUNDF vvv + return ::roundf(__x); +#endif // !_CCCL_BUILTIN_ROUNDF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double round(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ROUND) + return _CCCL_BUILTIN_ROUND(__x); +#else // ^^^ _CCCL_BUILTIN_ROUND ^^^ // vvv !_CCCL_BUILTIN_ROUND vvv + return ::round(__x); +#endif // !_CCCL_BUILTIN_ROUND +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double round(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ROUNDL) + return _CCCL_BUILTIN_ROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_ROUNDL ^^^ // vvv !_CCCL_BUILTIN_ROUNDL vvv + return ::roundl(__x); +# endif // !_CCCL_BUILTIN_ROUNDL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double roundl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ROUNDL) + return _CCCL_BUILTIN_ROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_ROUNDL ^^^ // vvv !_CCCL_BUILTIN_ROUNDL vvv + return ::roundl(__x); +# endif // !_CCCL_BUILTIN_ROUNDL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half round(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::roundf(__half2float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 round(__nv_bfloat16 __x) noexcept +{ + return 
__float2bfloat16(_CUDA_VSTD::roundf(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double round(_Integer __x) noexcept +{ + return _CUDA_VSTD::round((double) __x); +} + +// trunc + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float trunc(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_TRUNCF) + return _CCCL_BUILTIN_TRUNCF(__x); +#else // ^^^ _CCCL_BUILTIN_TRUNCF ^^^ // vvv !_CCCL_BUILTIN_TRUNCF vvv + return ::truncf(__x); +#endif // !_CCCL_BUILTIN_TRUNCF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float truncf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_TRUNCF) + return _CCCL_BUILTIN_TRUNCF(__x); +#else // ^^^ _CCCL_BUILTIN_TRUNCF ^^^ // vvv !_CCCL_BUILTIN_TRUNCF vvv + return ::truncf(__x); +#endif // !_CCCL_BUILTIN_TRUNCF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double trunc(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_TRUNC) + return _CCCL_BUILTIN_TRUNC(__x); +#else // ^^^ _CCCL_BUILTIN_TRUNC ^^^ // vvv !_CCCL_BUILTIN_TRUNC vvv + return ::trunc(__x); +#endif // !_CCCL_BUILTIN_TRUNC +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double trunc(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_TRUNCL) + return _CCCL_BUILTIN_TRUNCL(__x); +# else // ^^^ _CCCL_BUILTIN_TRUNCL ^^^ // vvv !_CCCL_BUILTIN_TRUNCL vvv + return ::truncl(__x); +# endif // !_CCCL_BUILTIN_TRUNCL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double truncl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_TRUNCL) + return _CCCL_BUILTIN_TRUNCL(__x); +# else // ^^^ _CCCL_BUILTIN_TRUNCL ^^^ // vvv !_CCCL_BUILTIN_TRUNCL vvv + return ::truncl(__x); +# endif // !_CCCL_BUILTIN_TRUNCL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half trunc(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::htrunc(__x);), (return __float2half(_CUDA_VSTD::trunc(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 trunc(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::htrunc(__x);), (return __float2bfloat16(_CUDA_VSTD::trunc(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double trunc(_Integer __x) noexcept +{ + return _CUDA_VSTD::trunc((double) __x); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_ROUNDING_FUNCTIONS_H diff --git a/libcudacxx/include/cuda/std/__complex/nvbf16.h b/libcudacxx/include/cuda/std/__complex/nvbf16.h index 1282b47f6d9..99fcde51002 100644 --- a/libcudacxx/include/cuda/std/__complex/nvbf16.h +++ b/libcudacxx/include/cuda/std/__complex/nvbf16.h @@ -85,8 +85,8 @@ struct __type_to_vector<__nv_bfloat16> template <> struct __cccl_complex_overload_traits<__nv_bfloat16, false, false> { - typedef __nv_bfloat16 _ValueType; - typedef complex<__nv_bfloat16> _ComplexType; + using _ValueType = __nv_bfloat16; + using _ComplexType = complex<__nv_bfloat16>; }; template diff --git a/libcudacxx/include/cuda/std/__complex/nvfp16.h b/libcudacxx/include/cuda/std/__complex/nvfp16.h index bc2da05d61d..7e51a81d8cb 100644 --- a/libcudacxx/include/cuda/std/__complex/nvfp16.h +++ b/libcudacxx/include/cuda/std/__complex/nvfp16.h @@ -82,8 +82,8 @@ struct __type_to_vector<__half> template <> struct __cccl_complex_overload_traits<__half, false, false> { - typedef 
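The rounding-function overloads above all follow one shape: call the compiler builtin when its detection macro (_CCCL_BUILTIN_TRUNCF and friends) is defined, otherwise fall back to the C math library, and implement the __half / __nv_bfloat16 overloads by widening to float and narrowing the result. A minimal sketch of that pattern, assuming a hypothetical free function my_trunc (only _CCCL_BUILTIN_TRUNCF, __half2float and __float2half are taken from the hunks above and from CUDA's cuda_fp16.h):

    #include <cmath>
    #include <cuda_fp16.h>

    __host__ __device__ inline float my_trunc(float x) noexcept
    {
    #if defined(_CCCL_BUILTIN_TRUNCF)
      return _CCCL_BUILTIN_TRUNCF(x); // compiler builtin when available
    #else
      return ::truncf(x); // C library on the host, CUDA math library on the device
    #endif
    }

    __host__ __device__ inline __half my_trunc(__half x) noexcept
    {
      // __half has no dedicated C entry point, so round-trip through float.
      return __float2half(my_trunc(__half2float(x)));
    }
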
__half _ValueType; - typedef complex<__half> _ComplexType; + using _ValueType = __half; + using _ComplexType = complex<__half>; }; template diff --git a/libcudacxx/include/cuda/std/__cuda/cstdint_prelude.h b/libcudacxx/include/cuda/std/__cuda/cstdint_prelude.h deleted file mode 100644 index 5111e9dd82a..00000000000 --- a/libcudacxx/include/cuda/std/__cuda/cstdint_prelude.h +++ /dev/null @@ -1,90 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX___CUDA_CSTDINT_PRELUDE_H -#define _LIBCUDACXX___CUDA_CSTDINT_PRELUDE_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if !_CCCL_COMPILER(NVRTC) -# include -#else // ^^^ !_CCCL_COMPILER(NVRTC) ^^^ / vvv _CCCL_COMPILER(NVRTC) vvv -typedef signed char int8_t; -typedef unsigned char uint8_t; -typedef signed short int16_t; -typedef unsigned short uint16_t; -typedef signed int int32_t; -typedef unsigned int uint32_t; -typedef signed long long int64_t; -typedef unsigned long long uint64_t; - -# define _LIBCUDACXX_ADDITIONAL_INTS(N) \ - typedef int##N##_t int_fast##N##_t; \ - typedef uint##N##_t uint_fast##N##_t; \ - typedef int##N##_t int_least##N##_t; \ - typedef uint##N##_t uint_least##N##_t - -_LIBCUDACXX_ADDITIONAL_INTS(8); -_LIBCUDACXX_ADDITIONAL_INTS(16); -_LIBCUDACXX_ADDITIONAL_INTS(32); -_LIBCUDACXX_ADDITIONAL_INTS(64); -# undef _LIBCUDACXX_ADDITIONAL_INTS - -typedef int64_t intptr_t; -typedef uint64_t uintptr_t; -typedef int64_t intmax_t; -typedef uint64_t uintmax_t; - -# define INT8_MIN SCHAR_MIN -# define INT16_MIN SHRT_MIN -# define INT32_MIN INT_MIN -# define INT64_MIN LLONG_MIN -# define INT8_MAX SCHAR_MAX -# define INT16_MAX SHRT_MAX -# define INT32_MAX INT_MAX -# define INT64_MAX LLONG_MAX -# define UINT8_MAX UCHAR_MAX -# define UINT16_MAX USHRT_MAX -# define UINT32_MAX UINT_MAX -# define UINT64_MAX ULLONG_MAX -# define INT_FAST8_MIN SCHAR_MIN -# define INT_FAST16_MIN SHRT_MIN -# define INT_FAST32_MIN INT_MIN -# define INT_FAST64_MIN LLONG_MIN -# define INT_FAST8_MAX SCHAR_MAX -# define INT_FAST16_MAX SHRT_MAX -# define INT_FAST32_MAX INT_MAX -# define INT_FAST64_MAX LLONG_MAX -# define UINT_FAST8_MAX UCHAR_MAX -# define UINT_FAST16_MAX USHRT_MAX -# define UINT_FAST32_MAX UINT_MAX -# define UINT_FAST64_MAX ULLONG_MAX - -# define INT8_C(X) ((int_least8_t) (X)) -# define INT16_C(X) ((int_least16_t) (X)) -# define INT32_C(X) ((int_least32_t) (X)) -# define INT64_C(X) ((int_least64_t) (X)) -# define UINT8_C(X) ((uint_least8_t) (X)) -# define UINT16_C(X) ((uint_least16_t) (X)) -# define UINT32_C(X) ((uint_least32_t) (X)) -# define UINT64_C(X) ((uint_least64_t) (X)) -# define INTMAX_C(X) ((intmax_t) (X)) -# define UINTMAX_C(X) ((uintmax_t) (X)) -#endif // _CCCL_COMPILER(NVRTC) - -#endif // _LIBCUDACXX___CUDA_CSTDINT_PRELUDE_H diff --git a/libcudacxx/include/cuda/std/__exception/cuda_error.h b/libcudacxx/include/cuda/std/__exception/cuda_error.h index 40af7d6c3e6..fdc32cf0571 
100644 --- a/libcudacxx/include/cuda/std/__exception/cuda_error.h +++ b/libcudacxx/include/cuda/std/__exception/cuda_error.h @@ -22,10 +22,6 @@ # pragma system_header #endif // no system header -#if _CCCL_CUDA_COMPILER(CLANG) -# include -#endif // _CCCL_CUDA_COMPILER(CLANG) - #include #if !_CCCL_COMPILER(NVRTC) @@ -40,8 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA /** * @brief Exception thrown when a CUDA error is encountered. */ -#if _CCCL_HAS_CUDA_COMPILER -# ifndef _CCCL_NO_EXCEPTIONS +#ifndef _CCCL_NO_EXCEPTIONS class cuda_error : public ::std::runtime_error { private: @@ -50,37 +45,36 @@ class cuda_error : public ::std::runtime_error char __buffer[256]; }; - static char* __format_cuda_error(::cudaError_t __status, const char* __msg, char* __msg_buffer) noexcept + static char* __format_cuda_error(const int __status, const char* __msg, char* __msg_buffer) noexcept { ::snprintf(__msg_buffer, 256, "cudaError %d: %s", __status, __msg); return __msg_buffer; } public: - cuda_error(::cudaError_t __status, const char* __msg, __msg_storage __msg_buffer = {0}) noexcept + cuda_error(const int __status, const char* __msg, __msg_storage __msg_buffer = {0}) noexcept : ::std::runtime_error(__format_cuda_error(__status, __msg, __msg_buffer.__buffer)) {} }; -_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(::cudaError_t __status, const char* __msg) +_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(const int __status, const char* __msg) { NV_IF_ELSE_TARGET(NV_IS_HOST, (throw ::cuda::cuda_error(__status, __msg);), ((void) __status; (void) __msg; _CUDA_VSTD_NOVERSION::terminate();)) } -# else // ^^^ !_CCCL_NO_EXCEPTIONS ^^^ / vvv _CCCL_NO_EXCEPTIONS vvv +#else // ^^^ !_CCCL_NO_EXCEPTIONS ^^^ / vvv _CCCL_NO_EXCEPTIONS vvv class cuda_error { public: - _LIBCUDACXX_HIDE_FROM_ABI cuda_error(::cudaError_t, const char*) noexcept {} + _LIBCUDACXX_HIDE_FROM_ABI cuda_error(const int, const char*) noexcept {} }; -_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(::cudaError_t, const char*) +_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(const int, const char*) { _CUDA_VSTD_NOVERSION::terminate(); } -# endif // _CCCL_NO_EXCEPTIONS -#endif // _CCCL_CUDA_COMPILER +#endif // _CCCL_NO_EXCEPTIONS _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/__exception/terminate.h b/libcudacxx/include/cuda/std/__exception/terminate.h index a65722bac64..c5dd9a7e6cb 100644 --- a/libcudacxx/include/cuda/std/__exception/terminate.h +++ b/libcudacxx/include/cuda/std/__exception/terminate.h @@ -37,7 +37,7 @@ _CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __cccl_terminate() noexcept #if 0 // Expose once atomic is universally available -typedef void (*terminate_handler)(); +using terminate_handler = void (*)(); # ifdef __CUDA_ARCH__ __device__ diff --git a/libcudacxx/include/cuda/std/__functional/binary_function.h b/libcudacxx/include/cuda/std/__functional/binary_function.h index af7230678b7..5b400088e4c 100644 --- a/libcudacxx/include/cuda/std/__functional/binary_function.h +++ b/libcudacxx/include/cuda/std/__functional/binary_function.h @@ -27,9 +27,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct _CCCL_TYPE_VISIBILITY_DEFAULT _LIBCUDACXX_DEPRECATED_IN_CXX11 binary_function { - typedef _Arg1 first_argument_type; - typedef _Arg2 second_argument_type; - typedef _Result result_type; + using first_argument_type = _Arg1; + using second_argument_type = _Arg2; + using result_type = _Result; }; #endif // _CCCL_STD_VER <= 2014 || 
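With the cuda_error change above, cuda::cuda_error and __throw_cuda_error take a plain int status instead of ::cudaError_t, so the exception machinery no longer needs the CUDA runtime header. Per the hunk, the constructor formats "cudaError %d: %s" into a fixed 256-byte buffer, and __throw_cuda_error throws on the host but terminates on the device. A hedged usage sketch of that internal helper (check_cuda is a hypothetical function; the status value 700 is only an example):

    #include <cuda/std/__exception/cuda_error.h> // header modified above

    void check_cuda(int status, const char* msg)
    {
      if (status != 0)
      {
        // Throws cuda::cuda_error on the host; on the device this path terminates.
        ::cuda::__throw_cuda_error(status, msg);
      }
    }

    // check_cuda(700, "an illegal memory access was encountered") surfaces an exception
    // whose what() reads "cudaError 700: an illegal memory access was encountered".
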
defined(_LIBCUDACXX_ENABLE_CXX17_REMOVED_UNARY_BINARY_FUNCTION) diff --git a/libcudacxx/include/cuda/std/__functional/bind.h b/libcudacxx/include/cuda/std/__functional/bind.h index 0c1beac45c9..7542191c0d8 100644 --- a/libcudacxx/include/cuda/std/__functional/bind.h +++ b/libcudacxx/include/cuda/std/__functional/bind.h @@ -122,7 +122,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI enable_if_t::value, __invoke_of<_Ti&, _Uj...>> __mu(_Ti& __ti, tuple<_Uj...>& __uj) { - typedef __make_tuple_indices_t __indices; + using __indices = __make_tuple_indices_t; return _CUDA_VSTD::__mu_expand(__ti, __uj, __indices()); } @@ -133,7 +133,7 @@ struct __mu_return2 template struct __mu_return2 { - typedef __tuple_element_t::value - 1, _Uj> type; + using type = __tuple_element_t::value - 1, _Uj>; }; template @@ -160,13 +160,13 @@ struct __mu_return_impl; template struct __mu_return_invokable // false { - typedef __nat type; + using type = __nat; }; template struct __mu_return_invokable { - typedef typename __invoke_of<_Ti&, _Uj...>::type type; + using type = typename __invoke_of<_Ti&, _Uj...>::type; }; template @@ -177,19 +177,19 @@ struct __mu_return_impl<_Ti, false, true, false, tuple<_Uj...>> template struct __mu_return_impl<_Ti, false, false, true, _TupleUj> { - typedef __tuple_element_t::value - 1, _TupleUj>&& type; + using type = __tuple_element_t::value - 1, _TupleUj>&&; }; template struct __mu_return_impl<_Ti, true, false, false, _TupleUj> { - typedef typename _Ti::type& type; + using type = typename _Ti::type&; }; template struct __mu_return_impl<_Ti, false, false, false, _TupleUj> { - typedef _Ti& type; + using type = _Ti&; }; template @@ -226,13 +226,13 @@ struct __bind_return; template struct __bind_return<_Fp, tuple<_BoundArgs...>, _TupleUj, true> { - typedef typename __invoke_of<_Fp&, typename __mu_return<_BoundArgs, _TupleUj>::type...>::type type; + using type = typename __invoke_of<_Fp&, typename __mu_return<_BoundArgs, _TupleUj>::type...>::type; }; template struct __bind_return<_Fp, const tuple<_BoundArgs...>, _TupleUj, true> { - typedef typename __invoke_of<_Fp&, typename __mu_return::type...>::type type; + using type = typename __invoke_of<_Fp&, typename __mu_return::type...>::type; }; template @@ -249,14 +249,14 @@ template class __bind : public __weak_result_type> { protected: - typedef decay_t<_Fp> _Fd; - typedef tuple...> _Td; + using _Fd = decay_t<_Fp>; + using _Td = tuple...>; private: _Fd __f_; _Td __bound_args_; - typedef __make_tuple_indices_t __indices; + using __indices = __make_tuple_indices_t; public: template > : public true_type template class __bind_r : public __bind<_Fp, _BoundArgs...> { - typedef __bind<_Fp, _BoundArgs...> base; - typedef typename base::_Fd _Fd; - typedef typename base::_Td _Td; + using base = __bind<_Fp, _BoundArgs...>; + using _Fd = typename base::_Fd; + using _Td = typename base::_Td; public: - typedef _Rp result_type; + using result_type = _Rp; template result_type> operator()(_Args&&... __args) { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; + using _Invoker = __invoke_void_return_wrapper<_Rp>; return _Invoker::__call(static_cast(*this), _CUDA_VSTD::forward<_Args>(__args)...); } @@ -321,7 +321,7 @@ class __bind_r : public __bind<_Fp, _BoundArgs...> result_type> operator()(_Args&&... 
__args) const { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; + using _Invoker = __invoke_void_return_wrapper<_Rp>; return _Invoker::__call(static_cast(*this), _CUDA_VSTD::forward<_Args>(__args)...); } }; @@ -333,7 +333,7 @@ struct is_bind_expression<__bind_r<_Rp, _Fp, _BoundArgs...>> : public true_type template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 __bind<_Fp, _BoundArgs...> bind(_Fp&& __f, _BoundArgs&&... __bound_args) { - typedef __bind<_Fp, _BoundArgs...> type; + using type = __bind<_Fp, _BoundArgs...>; return type(_CUDA_VSTD::forward<_Fp>(__f), _CUDA_VSTD::forward<_BoundArgs>(__bound_args)...); } @@ -341,7 +341,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 __bind_r<_Rp, _Fp, _BoundArgs...> bind(_Fp&& __f, _BoundArgs&&... __bound_args) { - typedef __bind_r<_Rp, _Fp, _BoundArgs...> type; + using type = __bind_r<_Rp, _Fp, _BoundArgs...>; return type(_CUDA_VSTD::forward<_Fp>(__f), _CUDA_VSTD::forward<_BoundArgs>(__bound_args)...); } diff --git a/libcudacxx/include/cuda/std/__functional/function.h b/libcudacxx/include/cuda/std/__functional/function.h index e2ec912e6fb..6544f572c81 100644 --- a/libcudacxx/include/cuda/std/__functional/function.h +++ b/libcudacxx/include/cuda/std/__functional/function.h @@ -136,8 +136,8 @@ class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> __compressed_pair<_Fp, _Ap> __f_; public: - typedef _CCCL_NODEBUG_ALIAS _Fp _Target; - typedef _CCCL_NODEBUG_ALIAS _Ap _Alloc; + using _Target _CCCL_NODEBUG_ALIAS = _Fp; + using _Alloc _CCCL_NODEBUG_ALIAS = _Ap; _LIBCUDACXX_HIDE_FROM_ABI const _Target& __target() const { @@ -170,16 +170,16 @@ class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> _LIBCUDACXX_HIDE_FROM_ABI _Rp operator()(_ArgTypes&&... __arg) { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; + using _Invoker = __invoke_void_return_wrapper<_Rp>; return _Invoker::__call(__f_.first(), _CUDA_VSTD::forward<_ArgTypes>(__arg)...); } _LIBCUDACXX_HIDE_FROM_ABI __alloc_func* __clone() const { - typedef allocator_traits<_Alloc> __alloc_traits; - typedef typename __rebind_alloc_helper<__alloc_traits, __alloc_func>::type _AA; + using __alloc_traits = allocator_traits<_Alloc>; + using _AA = typename __rebind_alloc_helper<__alloc_traits, __alloc_func>::type; _AA __a(__f_.second()); - typedef __allocator_destructor<_AA> _Dp; + using _Dp = __allocator_destructor<_AA>; unique_ptr<__alloc_func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); ::new ((void*) __hold.get()) __alloc_func(__f_.first(), _Alloc(__a)); return __hold.release(); @@ -192,8 +192,8 @@ class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> static void __destroy_and_delete(__alloc_func* __f) { - typedef allocator_traits<_Alloc> __alloc_traits; - typedef typename __rebind_alloc_helper<__alloc_traits, __alloc_func>::type _FunAlloc; + using __alloc_traits = allocator_traits<_Alloc>; + using _FunAlloc = typename __rebind_alloc_helper<__alloc_traits, __alloc_func>::type; _FunAlloc __a(__f->__get_allocator()); __f->destroy(); __a.deallocate(__f, 1); @@ -206,7 +206,7 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> _Fp __f_; public: - typedef _CCCL_NODEBUG_ALIAS _Fp _Target; + using _Target _CCCL_NODEBUG_ALIAS = _Fp; _LIBCUDACXX_HIDE_FROM_ABI const _Target& __target() const { @@ -223,7 +223,7 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> _LIBCUDACXX_HIDE_FROM_ABI _Rp operator()(_ArgTypes&&... 
__arg) { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; + using _Invoker = __invoke_void_return_wrapper<_Rp>; return _Invoker::__call(__f_, _CUDA_VSTD::forward<_ArgTypes>(__arg)...); } @@ -313,10 +313,10 @@ class __func<_Fp, _Alloc, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> template __base<_Rp(_ArgTypes...)>* __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::__clone() const { - typedef allocator_traits<_Alloc> __alloc_traits; - typedef typename __rebind_alloc_helper<__alloc_traits, __func>::type _Ap; + using __alloc_traits = allocator_traits<_Alloc>; + using _Ap = typename __rebind_alloc_helper<__alloc_traits, __func>::type; _Ap __a(__f_.__get_allocator()); - typedef __allocator_destructor<_Ap> _Dp; + using _Dp = __allocator_destructor<_Ap>; unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); ::new ((void*) __hold.get()) __func(__f_.__target(), _Alloc(__a)); return __hold.release(); @@ -337,8 +337,8 @@ void __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::destroy() noexcept template void __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::destroy_deallocate() noexcept { - typedef allocator_traits<_Alloc> __alloc_traits; - typedef typename __rebind_alloc_helper<__alloc_traits, __func>::type _Ap; + using __alloc_traits = allocator_traits<_Alloc>; + using _Ap = typename __rebind_alloc_helper<__alloc_traits, __func>::type; _Ap __a(__f_.__get_allocator()); __f_.destroy(); __a.deallocate(this, 1); @@ -380,7 +380,7 @@ class __value_func<_Rp(_ArgTypes...)> { typename aligned_storage<3 * sizeof(void*)>::type __buf_; - typedef __base<_Rp(_ArgTypes...)> __func; + using __func = __base<_Rp(_ArgTypes...)>; __func* __f_; _LIBCUDACXX_NO_CFI static __func* __as_base(void* __p) @@ -397,9 +397,9 @@ class __value_func<_Rp(_ArgTypes...)> _LIBCUDACXX_HIDE_FROM_ABI __value_func(_Fp&& __f, const _Alloc& __a) : __f_(nullptr) { - typedef allocator_traits<_Alloc> __alloc_traits; - typedef __function::__func<_Fp, _Alloc, _Rp(_ArgTypes...)> _Fun; - typedef typename __rebind_alloc_helper<__alloc_traits, _Fun>::type _FunAlloc; + using __alloc_traits = allocator_traits<_Alloc>; + using _Fun = __function::__func<_Fp, _Alloc, _Rp(_ArgTypes...)>; + using _FunAlloc = typename __rebind_alloc_helper<__alloc_traits, _Fun>::type; if (__function::__not_null(__f)) { @@ -411,7 +411,7 @@ class __value_func<_Rp(_ArgTypes...)> } else { - typedef __allocator_destructor<_FunAlloc> _Dp; + using _Dp = __allocator_destructor<_FunAlloc>; unique_ptr<__func, _Dp> __hold(__af.allocate(1), _Dp(__af, 1)); ::new ((void*) __hold.get()) _Fun(_CUDA_VSTD::move(__f), _Alloc(__a)); __f_ = __hold.release(); @@ -697,7 +697,7 @@ struct __policy_invoker; template struct __policy_invoker<_Rp(_ArgTypes...)> { - typedef _Rp (*__Call)(const __policy_storage*, __fast_forward<_ArgTypes>...); + using __Call = _Rp (*)(const __policy_storage*, __fast_forward<_ArgTypes>...); __Call __call_; @@ -746,7 +746,7 @@ class __policy_func<_Rp(_ArgTypes...)> // Calls the value stored in __buf_. This could technically be part of // policy, but storing it here eliminates a level of indirection inside // operator(). - typedef __function::__policy_invoker<_Rp(_ArgTypes...)> __invoker; + using __invoker = __function::__policy_invoker<_Rp(_ArgTypes...)>; __invoker __invoker_; // The policy that describes how to move / copy / destroy __buf_. 
Never @@ -762,9 +762,9 @@ class __policy_func<_Rp(_ArgTypes...)> _LIBCUDACXX_HIDE_FROM_ABI __policy_func(_Fp&& __f, const _Alloc& __a) : __policy_(__policy::__create_empty()) { - typedef __alloc_func<_Fp, _Alloc, _Rp(_ArgTypes...)> _Fun; - typedef allocator_traits<_Alloc> __alloc_traits; - typedef typename __rebind_alloc_helper<__alloc_traits, _Fun>::type _FunAlloc; + using _Fun = __alloc_func<_Fp, _Alloc, _Rp(_ArgTypes...)>; + using __alloc_traits = allocator_traits<_Alloc>; + using _FunAlloc = typename __rebind_alloc_helper<__alloc_traits, _Fun>::type; if (__function::__not_null(__f)) { @@ -778,7 +778,7 @@ class __policy_func<_Rp(_ArgTypes...)> } else { - typedef __allocator_destructor<_FunAlloc> _Dp; + using _Dp = __allocator_destructor<_FunAlloc>; unique_ptr<_Fun, _Dp> __hold(__af.allocate(1), _Dp(__af, 1)); ::new ((void*) __hold.get()) _Fun(_CUDA_VSTD::move(__f), _Alloc(__af)); __buf_.__large = __hold.release(); @@ -790,7 +790,7 @@ class __policy_func<_Rp(_ArgTypes...)> _LIBCUDACXX_HIDE_FROM_ABI explicit __policy_func(_Fp&& __f) : __policy_(__policy::__create_empty()) { - typedef __default_alloc_func<_Fp, _Rp(_ArgTypes...)> _Fun; + using _Fun = __default_alloc_func<_Fp, _Rp(_ArgTypes...)>; if (__function::__not_null(__f)) { @@ -913,7 +913,7 @@ extern "C" void _Block_release(const void*); template class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> { - typedef _Rp1 (^__block_type)(_ArgTypes1...); + using ...); = _Rp1 (^__block_type)(_ArgTypes1 __block_type __f_; public: @@ -989,7 +989,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT function<_Rp(_ArgTypes...)> : public __function::__maybe_derive_from_unary_function<_Rp(_ArgTypes...)> , public __function::__maybe_derive_from_binary_function<_Rp(_ArgTypes...)> { - typedef __function::__policy_func<_Rp(_ArgTypes...)> __func; + using __func = __function::__policy_func<_Rp(_ArgTypes...)>; __func __f_; @@ -1011,7 +1011,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT function<_Rp(_ArgTypes...)> using _EnableIfLValueCallable = enable_if_t<__callable<_Fp&>::value>; public: - typedef _Rp result_type; + using result_type = _Rp; // construct/copy/destroy: _LIBCUDACXX_HIDE_FROM_ABI function() noexcept {} diff --git a/libcudacxx/include/cuda/std/__functional/hash.h b/libcudacxx/include/cuda/std/__functional/hash.h index 5e3559663c0..d40f393e9b1 100644 --- a/libcudacxx/include/cuda/std/__functional/hash.h +++ b/libcudacxx/include/cuda/std/__functional/hash.h @@ -35,7 +35,7 @@ #include #include #include -#include +#include #ifndef __cuda_std__ @@ -45,7 +45,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI _Size __loadword(const void* __p) { _Size __r; - std::memcpy(&__r, __p, sizeof(__r)); + _CUDA_VSTD::memcpy(&__r, __p, sizeof(__r)); return __r; } @@ -374,7 +374,7 @@ struct _PairT _LIBCUDACXX_HIDE_FROM_ABI size_t __hash_combine(size_t __lhs, size_t __rhs) noexcept { - typedef __scalar_hash<_PairT> _HashT; + using _HashT = __scalar_hash<_PairT>; const _PairT __p = {__lhs, __rhs}; return _HashT()(__p); } @@ -618,7 +618,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __enum_hash : public __unary_function<_Tp, { _LIBCUDACXX_HIDE_FROM_ABI size_t operator()(_Tp __v) const noexcept { - typedef typename underlying_type<_Tp>::type type; + using type = typename underlying_type<_Tp>::type; return hash()(static_cast(__v)); } }; diff --git a/libcudacxx/include/cuda/std/__functional/invoke.h b/libcudacxx/include/cuda/std/__functional/invoke.h index f3072249fb8..e60e3b0b363 100644 --- a/libcudacxx/include/cuda/std/__functional/invoke.h +++ 
b/libcudacxx/include/cuda/std/__functional/invoke.h @@ -56,200 +56,200 @@ struct __member_pointer_traits_imp template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...), true, false> { - typedef _Class _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...), true, false> { - typedef _Class _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const, true, false> { - typedef _Class const _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const, true, false> { - typedef _Class const _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) volatile, true, false> { - typedef _Class volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class volatile; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) volatile, true, false> { - typedef _Class volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class volatile; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const volatile, true, false> { - typedef _Class const volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const volatile; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const volatile, true, false> { - typedef _Class const volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const volatile; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...)&, true, false> { - typedef _Class& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...)&, true, false> { - typedef _Class& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const&, true, false> { - typedef _Class const& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) 
const&, true, false> { - typedef _Class const& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) volatile&, true, false> { - typedef _Class volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class volatile&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) volatile&, true, false> { - typedef _Class volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class volatile&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const volatile&, true, false> { - typedef _Class const volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const volatile&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const volatile&, true, false> { - typedef _Class const volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const volatile&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...)&&, true, false> { - typedef _Class&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...)&&, true, false> { - typedef _Class&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const&&, true, false> { - typedef _Class const&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const&&, true, false> { - typedef _Class const&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) volatile&&, true, false> { - typedef _Class volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class volatile&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) volatile&&, true, false> { - typedef _Class volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class volatile&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) 
const volatile&&, true, false> { - typedef _Class const volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const volatile&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const volatile&&, true, false> { - typedef _Class const volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const volatile&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp _Class::*, false, true> { - typedef _Class _ClassType; - typedef _Rp _ReturnType; + using _ClassType = _Class; + using _ReturnType = _Rp; }; template @@ -270,7 +270,7 @@ struct __member_pointer_class_type template struct __member_pointer_class_type<_Ret _ClassType::*> { - typedef _ClassType type; + using type = _ClassType; }; template struct __nothrow_invokable_r_imp { - typedef __nothrow_invokable_r_imp _ThisT; + using _ThisT = __nothrow_invokable_r_imp; template _LIBCUDACXX_HIDE_FROM_ABI static void __test_noexcept(_Tp) noexcept; diff --git a/libcudacxx/include/cuda/std/__functional/mem_fn.h b/libcudacxx/include/cuda/std/__functional/mem_fn.h index 8327b4edfef..ffbf2c90822 100644 --- a/libcudacxx/include/cuda/std/__functional/mem_fn.h +++ b/libcudacxx/include/cuda/std/__functional/mem_fn.h @@ -33,7 +33,7 @@ class __mem_fn : public __weak_result_type<_Tp> { public: // types - typedef _Tp type; + using type = _Tp; private: type __f_; diff --git a/libcudacxx/include/cuda/std/__functional/operations.h b/libcudacxx/include/cuda/std/__functional/operations.h index a52a0af2840..24ced46b12f 100644 --- a/libcudacxx/include/cuda/std/__functional/operations.h +++ b/libcudacxx/include/cuda/std/__functional/operations.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct _CCCL_TYPE_VISIBILITY_DEFAULT plus : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -52,13 +52,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT plus { return _CUDA_VSTD::forward<_T1>(__t) + _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT minus : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -78,13 +78,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT minus { return _CUDA_VSTD::forward<_T1>(__t) - _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT multiplies : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -104,13 +104,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT multiplies { return _CUDA_VSTD::forward<_T1>(__t) * _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT divides : 
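Most of the functional-header churn above is a mechanical typedef-to-using rewrite; the two spellings declare exactly the same alias, and the alias-declaration form stays readable for function-pointer and template aliases. A minimal equivalence check with hypothetical names (example_traits, old_call_t, new_call_t):

    #include <type_traits>

    template <class Ret, class... Args>
    struct example_traits
    {
      typedef Ret (*old_call_t)(Args...);  // old spelling
      using new_call_t = Ret (*)(Args...); // new spelling, same type
      static_assert(std::is_same<old_call_t, new_call_t>::value, "purely syntactic change");
    };

    template struct example_traits<void, int, float>; // instantiate to run the check
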
__binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -130,13 +130,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT divides { return _CUDA_VSTD::forward<_T1>(__t) / _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT modulus : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -156,13 +156,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT modulus { return _CUDA_VSTD::forward<_T1>(__t) % _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT negate : __unary_function<_Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _Tp operator()(const _Tp& __x) const { @@ -181,7 +181,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT negate { return -_CUDA_VSTD::forward<_Tp>(__x); } - typedef void is_transparent; + using is_transparent = void; }; // Bitwise operations @@ -189,7 +189,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT negate template struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_and : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -209,7 +209,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_and { return _CUDA_VSTD::forward<_T1>(__t) & _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template @@ -233,13 +233,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_not { return ~_CUDA_VSTD::forward<_Tp>(__x); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_or : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -259,13 +259,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_or { return _CUDA_VSTD::forward<_T1>(__t) | _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_xor : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -285,7 +285,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_xor { return _CUDA_VSTD::forward<_T1>(__t) ^ _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; // Comparison operations @@ -293,7 +293,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_xor template struct _CCCL_TYPE_VISIBILITY_DEFAULT equal_to : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using 
__result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -313,13 +313,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT equal_to { return _CUDA_VSTD::forward<_T1>(__t) == _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT not_equal_to : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -339,13 +339,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT not_equal_to { return _CUDA_VSTD::forward<_T1>(__t) != _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT less : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -365,13 +365,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT less { return _CUDA_VSTD::forward<_T1>(__t) < _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT less_equal : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -391,13 +391,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT less_equal { return _CUDA_VSTD::forward<_T1>(__t) <= _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT greater_equal : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -417,13 +417,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT greater_equal { return _CUDA_VSTD::forward<_T1>(__t) >= _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT greater : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -443,7 +443,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT greater { return _CUDA_VSTD::forward<_T1>(__t) > _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; // Logical operations @@ -451,7 +451,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT greater template struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_and : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -471,13 +471,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_and { 
return _CUDA_VSTD::forward<_T1>(__t) && _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_not : __unary_function<_Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x) const { @@ -496,13 +496,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_not { return !_CUDA_VSTD::forward<_Tp>(__x); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_or : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -522,7 +522,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_or { return _CUDA_VSTD::forward<_T1>(__t) || _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__functional/reference_wrapper.h b/libcudacxx/include/cuda/std/__functional/reference_wrapper.h index dd8070871a9..7c4b4684079 100644 --- a/libcudacxx/include/cuda/std/__functional/reference_wrapper.h +++ b/libcudacxx/include/cuda/std/__functional/reference_wrapper.h @@ -35,7 +35,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT reference_wrapper : public __weak_result_typ { public: // types - typedef _Tp type; + using type = _Tp; private: type* __f_; diff --git a/libcudacxx/include/cuda/std/__functional/unary_function.h b/libcudacxx/include/cuda/std/__functional/unary_function.h index c3509753574..915bd68652b 100644 --- a/libcudacxx/include/cuda/std/__functional/unary_function.h +++ b/libcudacxx/include/cuda/std/__functional/unary_function.h @@ -26,8 +26,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct _CCCL_TYPE_VISIBILITY_DEFAULT _LIBCUDACXX_DEPRECATED_IN_CXX11 unary_function { - typedef _Arg argument_type; - typedef _Result result_type; + using argument_type = _Arg; + using result_type = _Result; }; #endif // _CCCL_STD_VER <= 2014 diff --git a/libcudacxx/include/cuda/std/__functional/unwrap_ref.h b/libcudacxx/include/cuda/std/__functional/unwrap_ref.h index 81868eafd5b..ca99e370ddf 100644 --- a/libcudacxx/include/cuda/std/__functional/unwrap_ref.h +++ b/libcudacxx/include/cuda/std/__functional/unwrap_ref.h @@ -25,7 +25,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct __unwrap_reference { - typedef _CCCL_NODEBUG_ALIAS _Tp type; + using type _CCCL_NODEBUG_ALIAS = _Tp; }; template @@ -34,7 +34,7 @@ class reference_wrapper; template struct __unwrap_reference> { - typedef _CCCL_NODEBUG_ALIAS _Tp& type; + using type _CCCL_NODEBUG_ALIAS = _Tp&; }; template diff --git a/libcudacxx/include/cuda/std/__functional/weak_result_type.h b/libcudacxx/include/cuda/std/__functional/weak_result_type.h index 1aff29113a3..dae84e02025 100644 --- a/libcudacxx/include/cuda/std/__functional/weak_result_type.h +++ b/libcudacxx/include/cuda/std/__functional/weak_result_type.h @@ -59,7 +59,7 @@ struct __derives_from_unary_function public: static const bool value = !is_same::value; - typedef decltype(__test((_Tp*) 0)) type; + using type = decltype(__test((_Tp*) 0)); }; template @@ -78,7 +78,7 @@ struct __derives_from_binary_function public: static const bool value = !is_same::value; - typedef 
decltype(__test((_Tp*) 0)) type; + using type = decltype(__test((_Tp*) 0)); }; template ::value> @@ -266,7 +266,7 @@ struct __weak_result_type<_Rp (_Cp::*)(_A1, _A2, _A3...) const volatile> template struct __invoke_return { - typedef decltype(_CUDA_VSTD::__invoke(declval<_Tp>(), declval<_Args>()...)) type; + using type = decltype(_CUDA_VSTD::__invoke(declval<_Tp>(), declval<_Args>()...)); }; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__fwd/string_view.h b/libcudacxx/include/cuda/std/__fwd/string_view.h index 32fd502f818..0b0d9b51858 100644 --- a/libcudacxx/include/cuda/std/__fwd/string_view.h +++ b/libcudacxx/include/cuda/std/__fwd/string_view.h @@ -28,14 +28,14 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template > class _CCCL_TYPE_VISIBILITY_DEFAULT basic_string_view; -typedef basic_string_view string_view; +using string_view = basic_string_view; #ifndef _LIBCUDACXX_HAS_NO_CHAR8_T -typedef basic_string_view u8string_view; +using u8string_view = basic_string_view; #endif -typedef basic_string_view u16string_view; -typedef basic_string_view u32string_view; +using u16string_view = basic_string_view; +using u32string_view = basic_string_view; #ifndef _LIBCUDACXX_HAS_NO_WIDE_CHARACTERS -typedef basic_string_view wstring_view; +using wstring_view = basic_string_view; #endif // clang-format off diff --git a/libcudacxx/include/cuda/std/__internal/cpp_dialect.h b/libcudacxx/include/cuda/std/__internal/cpp_dialect.h index a4ea71d7ef5..2fec82a7fac 100644 --- a/libcudacxx/include/cuda/std/__internal/cpp_dialect.h +++ b/libcudacxx/include/cuda/std/__internal/cpp_dialect.h @@ -35,12 +35,10 @@ future release. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) // clang-format on -#if _CCCL_STD_VER < 2011 -# error libcu++ requires C++11 or later. -#elif _CCCL_STD_VER == 2011 && !defined(CCCL_IGNORE_DEPRECATED_CPP_11) -LIBCUDACXX_DIALECT_DEPRECATION(C++ 17, C++ 11) -#elif _CCCL_STD_VER == 2014 && !defined(CCCL_IGNORE_DEPRECATED_CPP_14) -LIBCUDACXX_DIALECT_DEPRECATION(C++ 17, C++ 14) -#endif // _CCCL_STD_VER >= 2017 +#ifndef CCCL_IGNORE_DEPRECATED_CPP_DIALECT +# if _CCCL_STD_VER < 2017 +# error libcu++ requires at least C++ 17. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message. 
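The dialect check rewritten above turns pre-C++17 builds into a hard error instead of a deprecation warning; the only opt-out is the macro named in the message. A small illustration of that opt-out (the macro name and error text come from the hunk; the include is just an example, and suppressing the check is not recommended):

    // Compiling this translation unit with, e.g., -std=c++14 would otherwise stop with:
    //   error: libcu++ requires at least C++ 17. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT
    //   to suppress this message.
    #define CCCL_IGNORE_DEPRECATED_CPP_DIALECT 1
    #include <cuda/std/version> // any libcu++ header runs the dialect check
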
+# endif // _CCCL_STD_VER >= 2017 +#endif // CCCL_IGNORE_DEPRECATED_CPP_DIALECT #endif // _LIBCUDACXX___INTERNAL_CPP_DIALECT_H diff --git a/libcudacxx/include/cuda/std/__iterator/advance.h b/libcudacxx/include/cuda/std/__iterator/advance.h index 17ba093634a..48359585167 100644 --- a/libcudacxx/include/cuda/std/__iterator/advance.h +++ b/libcudacxx/include/cuda/std/__iterator/advance.h @@ -74,8 +74,8 @@ template ::value>> _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void advance(_InputIter& __i, _Distance __orig_n) { - typedef typename iterator_traits<_InputIter>::difference_type _Difference; - _Difference __n = static_cast<_Difference>(_CUDA_VSTD::__convert_to_integral(__orig_n)); + using _Difference = typename iterator_traits<_InputIter>::difference_type; + _Difference __n = static_cast<_Difference>(_CUDA_VSTD::__convert_to_integral(__orig_n)); _CCCL_ASSERT(__n >= 0 || __is_cpp17_bidirectional_iterator<_InputIter>::value, "Attempt to advance(it, n) with negative n on a non-bidirectional iterator"); _CUDA_VSTD::__advance(__i, __n, typename iterator_traits<_InputIter>::iterator_category()); diff --git a/libcudacxx/include/cuda/std/__iterator/back_insert_iterator.h b/libcudacxx/include/cuda/std/__iterator/back_insert_iterator.h index dbb8e3f8028..c551e0c7364 100644 --- a/libcudacxx/include/cuda/std/__iterator/back_insert_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/back_insert_iterator.h @@ -42,16 +42,16 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT back_insert_iterator _Container* container; public: - typedef output_iterator_tag iterator_category; - typedef void value_type; + using iterator_category = output_iterator_tag; + using value_type = void; #if _CCCL_STD_VER > 2017 - typedef ptrdiff_t difference_type; + using difference_type = ptrdiff_t; #else - typedef void difference_type; + using difference_type = void; #endif - typedef void pointer; - typedef void reference; - typedef _Container container_type; + using pointer = void; + using reference = void; + using container_type = _Container; _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 explicit back_insert_iterator(_Container& __x) : container(_CUDA_VSTD::addressof(__x)) diff --git a/libcudacxx/include/cuda/std/__iterator/front_insert_iterator.h b/libcudacxx/include/cuda/std/__iterator/front_insert_iterator.h index 9918441ea09..c60a65e9db3 100644 --- a/libcudacxx/include/cuda/std/__iterator/front_insert_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/front_insert_iterator.h @@ -42,16 +42,16 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT front_insert_iterator _Container* container; public: - typedef output_iterator_tag iterator_category; - typedef void value_type; + using iterator_category = output_iterator_tag; + using value_type = void; #if _CCCL_STD_VER > 2017 - typedef ptrdiff_t difference_type; + using difference_type = ptrdiff_t; #else - typedef void difference_type; + using difference_type = void; #endif - typedef void pointer; - typedef void reference; - typedef _Container container_type; + using pointer = void; + using reference = void; + using container_type = _Container; _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 explicit front_insert_iterator(_Container& __x) : container(_CUDA_VSTD::addressof(__x)) diff --git a/libcudacxx/include/cuda/std/__iterator/insert_iterator.h b/libcudacxx/include/cuda/std/__iterator/insert_iterator.h index 5d63ba91847..227c4983d5b 100644 --- a/libcudacxx/include/cuda/std/__iterator/insert_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/insert_iterator.h @@ -46,16 +46,16 @@ class 
_CCCL_TYPE_VISIBILITY_DEFAULT insert_iterator __insert_iterator_iter_t<_Container> iter; public: - typedef output_iterator_tag iterator_category; - typedef void value_type; + using iterator_category = output_iterator_tag; + using value_type = void; #if _CCCL_STD_VER > 2017 - typedef ptrdiff_t difference_type; + using difference_type = ptrdiff_t; #else - typedef void difference_type; + using difference_type = void; #endif - typedef void pointer; - typedef void reference; - typedef _Container container_type; + using pointer = void; + using reference = void; + using container_type = _Container; _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 insert_iterator(_Container& __x, __insert_iterator_iter_t<_Container> __i) diff --git a/libcudacxx/include/cuda/std/__iterator/istream_iterator.h b/libcudacxx/include/cuda/std/__iterator/istream_iterator.h index 687ebe69868..95cb22f734e 100644 --- a/libcudacxx/include/cuda/std/__iterator/istream_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/istream_iterator.h @@ -40,14 +40,14 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT istream_iterator _CCCL_SUPPRESS_DEPRECATED_POP public: - typedef input_iterator_tag iterator_category; - typedef _Tp value_type; - typedef _Distance difference_type; - typedef const _Tp* pointer; - typedef const _Tp& reference; - typedef _CharT char_type; - typedef _Traits traits_type; - typedef basic_istream<_CharT, _Traits> istream_type; + using iterator_category = input_iterator_tag; + using value_type = _Tp; + using difference_type = _Distance; + using pointer = const _Tp*; + using reference = const _Tp&; + using char_type = _CharT; + using traits_type = _Traits; + using istream_type = basic_istream<_CharT, _Traits>; private: istream_type* __in_stream_; diff --git a/libcudacxx/include/cuda/std/__iterator/istreambuf_iterator.h b/libcudacxx/include/cuda/std/__iterator/istreambuf_iterator.h index b716ea77c08..c44b79acb43 100644 --- a/libcudacxx/include/cuda/std/__iterator/istreambuf_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/istreambuf_iterator.h @@ -38,16 +38,16 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT istreambuf_iterator _CCCL_SUPPRESS_DEPRECATED_POP public: - typedef input_iterator_tag iterator_category; - typedef _CharT value_type; - typedef typename _Traits::off_type difference_type; - typedef _CharT* pointer; - typedef _CharT reference; - typedef _CharT char_type; - typedef _Traits traits_type; - typedef typename _Traits::int_type int_type; - typedef basic_streambuf<_CharT, _Traits> streambuf_type; - typedef basic_istream<_CharT, _Traits> istream_type; + using iterator_category = input_iterator_tag; + using value_type = _CharT; + using difference_type = typename _Traits::off_type; + using pointer = _CharT*; + using reference = _CharT; + using char_type = _CharT; + using traits_type = _Traits; + using int_type = typename _Traits::int_type; + using streambuf_type = basic_streambuf<_CharT, _Traits>; + using istream_type = basic_istream<_CharT, _Traits>; private: mutable streambuf_type* __sbuf_; diff --git a/libcudacxx/include/cuda/std/__iterator/iterator.h b/libcudacxx/include/cuda/std/__iterator/iterator.h index a85bbd4ef64..8d3e722507c 100644 --- a/libcudacxx/include/cuda/std/__iterator/iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/iterator.h @@ -28,11 +28,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct _CCCL_TYPE_VISIBILITY_DEFAULT _LIBCUDACXX_DEPRECATED_IN_CXX17 iterator { - typedef _Tp value_type; - typedef _Distance difference_type; - typedef _Pointer pointer; - typedef _Reference reference; - 
typedef _Category iterator_category; + using value_type = _Tp; + using difference_type = _Distance; + using pointer = _Pointer; + using reference = _Reference; + using iterator_category = _Category; }; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h index 27f9262e070..da153007b56 100644 --- a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h +++ b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h @@ -823,11 +823,11 @@ struct __iterator_traits_impl template struct __iterator_traits_impl<_Iter, true> { - typedef typename _Iter::difference_type difference_type; - typedef typename _Iter::value_type value_type; - typedef typename _Iter::pointer pointer; - typedef typename _Iter::reference reference; - typedef typename _Iter::iterator_category iterator_category; + using difference_type = typename _Iter::difference_type; + using value_type = typename _Iter::value_type; + using pointer = typename _Iter::pointer; + using reference = typename _Iter::reference; + using iterator_category = typename _Iter::iterator_category; }; template @@ -855,13 +855,13 @@ template #endif struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits<_Tp*> { - typedef ptrdiff_t difference_type; - typedef remove_cv_t<_Tp> value_type; - typedef _Tp* pointer; - typedef typename add_lvalue_reference<_Tp>::type reference; - typedef random_access_iterator_tag iterator_category; + using difference_type = ptrdiff_t; + using value_type = remove_cv_t<_Tp>; + using pointer = _Tp*; + using reference = typename add_lvalue_reference<_Tp>::type; + using iterator_category = random_access_iterator_tag; #if _CCCL_STD_VER >= 2014 - typedef contiguous_iterator_tag iterator_concept; + using iterator_concept = contiguous_iterator_tag; #endif }; diff --git a/libcudacxx/include/cuda/std/__iterator/move_iterator.h b/libcudacxx/include/cuda/std/__iterator/move_iterator.h index 0436b25b36c..7d7c5b3a600 100644 --- a/libcudacxx/include/cuda/std/__iterator/move_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/move_iterator.h @@ -151,16 +151,16 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator using pointer = _Iter; using reference = iter_rvalue_reference_t<_Iter>; #else // ^^^ _CCCL_STD_VER > 2014 ^^^ / vvv _CCCL_STD_VER < 2017 vvv - typedef _Iter iterator_type; - typedef _If<__is_cpp17_random_access_iterator<_Iter>::value, - random_access_iterator_tag, - typename iterator_traits<_Iter>::iterator_category> - iterator_category; - typedef typename iterator_traits::value_type value_type; - typedef typename iterator_traits::difference_type difference_type; - typedef iterator_type pointer; - typedef typename iterator_traits::reference __reference; - typedef conditional_t::value, remove_reference_t<__reference>&&, __reference> reference; + using iterator_type = _Iter; + using iterator_category = + _If<__is_cpp17_random_access_iterator<_Iter>::value, + random_access_iterator_tag, + typename iterator_traits<_Iter>::iterator_category>; + using value_type = typename iterator_traits::value_type; + using difference_type = typename iterator_traits::difference_type; + using pointer = iterator_type; + using __reference = typename iterator_traits::reference; + using reference = conditional_t::value, remove_reference_t<__reference>&&, __reference>; #endif // _CCCL_STD_VER < 2017 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 explicit move_iterator(_Iter __i) diff --git a/libcudacxx/include/cuda/std/__iterator/ostream_iterator.h 
b/libcudacxx/include/cuda/std/__iterator/ostream_iterator.h index 19d70cbd183..e04f168a3ea 100644 --- a/libcudacxx/include/cuda/std/__iterator/ostream_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/ostream_iterator.h @@ -39,18 +39,18 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT ostream_iterator _CCCL_SUPPRESS_DEPRECATED_POP public: - typedef output_iterator_tag iterator_category; - typedef void value_type; + using iterator_category = output_iterator_tag; + using value_type = void; #if _CCCL_STD_VER > 2017 - typedef ptrdiff_t difference_type; + using difference_type = ptrdiff_t; #else - typedef void difference_type; + using difference_type = void; #endif - typedef void pointer; - typedef void reference; - typedef _CharT char_type; - typedef _Traits traits_type; - typedef basic_ostream<_CharT, _Traits> ostream_type; + using pointer = void; + using reference = void; + using char_type = _CharT; + using traits_type = _Traits; + using ostream_type = basic_ostream<_CharT, _Traits>; private: ostream_type* __out_stream_; diff --git a/libcudacxx/include/cuda/std/__iterator/ostreambuf_iterator.h b/libcudacxx/include/cuda/std/__iterator/ostreambuf_iterator.h index f7a7ae1966d..b62226cb7f0 100644 --- a/libcudacxx/include/cuda/std/__iterator/ostreambuf_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/ostreambuf_iterator.h @@ -38,19 +38,19 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT ostreambuf_iterator _CCCL_SUPPRESS_DEPRECATED_POP public: - typedef output_iterator_tag iterator_category; - typedef void value_type; + using iterator_category = output_iterator_tag; + using value_type = void; #if _CCCL_STD_VER > 2017 - typedef ptrdiff_t difference_type; + using difference_type = ptrdiff_t; #else - typedef void difference_type; + using difference_type = void; #endif - typedef void pointer; - typedef void reference; - typedef _CharT char_type; - typedef _Traits traits_type; - typedef basic_streambuf<_CharT, _Traits> streambuf_type; - typedef basic_ostream<_CharT, _Traits> ostream_type; + using pointer = void; + using reference = void; + using char_type = _CharT; + using traits_type = _Traits; + using streambuf_type = basic_streambuf<_CharT, _Traits>; + using ostream_type = basic_ostream<_CharT, _Traits>; private: streambuf_type* __sbuf_; diff --git a/libcudacxx/include/cuda/std/__iterator/wrap_iter.h b/libcudacxx/include/cuda/std/__iterator/wrap_iter.h index 0760192de83..97e6c47d13c 100644 --- a/libcudacxx/include/cuda/std/__iterator/wrap_iter.h +++ b/libcudacxx/include/cuda/std/__iterator/wrap_iter.h @@ -34,14 +34,14 @@ template class __wrap_iter { public: - typedef _Iter iterator_type; - typedef typename iterator_traits::value_type value_type; - typedef typename iterator_traits::difference_type difference_type; - typedef typename iterator_traits::pointer pointer; - typedef typename iterator_traits::reference reference; - typedef typename iterator_traits::iterator_category iterator_category; + using iterator_type = _Iter; + using value_type = typename iterator_traits::value_type; + using difference_type = typename iterator_traits::difference_type; + using pointer = typename iterator_traits::pointer; + using reference = typename iterator_traits::reference; + using iterator_category = typename iterator_traits::iterator_category; #if _CCCL_STD_VER > 2011 - typedef contiguous_iterator_tag iterator_concept; + using iterator_concept = contiguous_iterator_tag; #endif private: @@ -241,9 +241,9 @@ struct __is_cpp17_contiguous_iterator<__wrap_iter<_It>> : true_type template struct _CCCL_TYPE_VISIBILITY_DEFAULT 
pointer_traits<__wrap_iter<_It>> { - typedef __wrap_iter<_It> pointer; - typedef typename pointer_traits<_It>::element_type element_type; - typedef typename pointer_traits<_It>::difference_type difference_type; + using pointer = __wrap_iter<_It>; + using element_type = typename pointer_traits<_It>::element_type; + using difference_type = typename pointer_traits<_It>::difference_type; _LIBCUDACXX_HIDE_FROM_ABI constexpr static element_type* to_address(pointer __w) noexcept { diff --git a/libcudacxx/include/cuda/std/__linalg/conj_if_needed.h b/libcudacxx/include/cuda/std/__linalg/conj_if_needed.h new file mode 100644 index 00000000000..c5ddcedcedb --- /dev/null +++ b/libcudacxx/include/cuda/std/__linalg/conj_if_needed.h @@ -0,0 +1,79 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +// ************************************************************************ +//@HEADER + +#ifndef _LIBCUDACXX___LINALG_CONJUGATE_IF_NEEDED_HPP +#define _LIBCUDACXX___LINALG_CONJUGATE_IF_NEEDED_HPP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 + +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +namespace linalg +{ + +_LIBCUDACXX_BEGIN_NAMESPACE_CPO(__conj_if_needed) + +template +_CCCL_CONCEPT _HasConj = _CCCL_REQUIRES_EXPR((_Type), _Type __a)(static_cast(_CUDA_VSTD::conj(__a))); + +struct __conj_if_needed +{ + template + _LIBCUDACXX_HIDE_FROM_ABI constexpr auto operator()(const _Type& __t) const + { + if constexpr (is_arithmetic_v<_Type> || !_HasConj<_Type>) + { + return __t; + } + else + { + return _CUDA_VSTD::conj(__t); + } + _CCCL_UNREACHABLE(); + } +}; + +_LIBCUDACXX_END_NAMESPACE_CPO + +inline namespace __cpo +{ +_CCCL_GLOBAL_CONSTANT auto conj_if_needed = __conj_if_needed::__conj_if_needed{}; + +} // namespace __cpo +} // end namespace linalg + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 +#endif // _LIBCUDACXX___LINALG_CONJUGATED_HPP diff --git a/libcudacxx/include/cuda/std/__linalg/conjugate_transposed.h b/libcudacxx/include/cuda/std/__linalg/conjugate_transposed.h new file mode 100644 index 00000000000..ab984c78152 --- /dev/null +++ b/libcudacxx/include/cuda/std/__linalg/conjugate_transposed.h @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +// ************************************************************************ +//@HEADER + +#ifndef _LIBCUDACXX___LINALG_CONJUGATE_TRANSPOSED_HPP +#define _LIBCUDACXX___LINALG_CONJUGATE_TRANSPOSED_HPP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 + +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +namespace linalg +{ + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +conjugate_transposed(mdspan<_ElementType, _Extents, _Layout, _Accessor> __a) +{ + return conjugated(transposed(__a)); +} + +} // end namespace linalg + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 +#endif // _LIBCUDACXX___LINALG_CONJUGATE_TRANSPOSED_HPP diff --git a/libcudacxx/include/cuda/std/__linalg/conjugated.h b/libcudacxx/include/cuda/std/__linalg/conjugated.h new file mode 100644 index 00000000000..8604ccdc1a7 --- /dev/null +++ b/libcudacxx/include/cuda/std/__linalg/conjugated.h @@ -0,0 +1,142 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +// ************************************************************************ +//@HEADER + +#ifndef _LIBCUDACXX___LINALG_CONJUGATED_HPP +#define _LIBCUDACXX___LINALG_CONJUGATED_HPP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +namespace linalg +{ + +template +class conjugated_accessor +{ +private: + using __nested_element_type = typename _NestedAccessor::element_type; + using __nc_result_type = decltype(conj_if_needed(_CUDA_VSTD::declval<__nested_element_type>())); + +public: + using element_type = add_const_t<__nc_result_type>; + using reference = remove_const_t; + using data_handle_type = typename _NestedAccessor::data_handle_type; + using offset_policy = conjugated_accessor; + + _CCCL_HIDE_FROM_ABI constexpr conjugated_accessor() = default; + + _LIBCUDACXX_HIDE_FROM_ABI constexpr conjugated_accessor(const _NestedAccessor& __acc) + : __nested_accessor_(__acc) + {} + + _CCCL_TEMPLATE(class _OtherNestedAccessor) + _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _NestedAccessor, const _OtherNestedAccessor&) + _CCCL_AND _CCCL_TRAIT(is_convertible, _OtherNestedAccessor, _NestedAccessor)) + _LIBCUDACXX_HIDE_FROM_ABI constexpr conjugated_accessor(const conjugated_accessor<_OtherNestedAccessor>& __other) + : __nested_accessor_(__other.nested_accessor()) + {} + + _CCCL_TEMPLATE(class _OtherNestedAccessor) + _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _NestedAccessor, const _OtherNestedAccessor&) + _CCCL_AND(!_CCCL_TRAIT(is_convertible, _OtherNestedAccessor, _NestedAccessor))) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr conjugated_accessor( + const conjugated_accessor<_OtherNestedAccessor>& __other) + : __nested_accessor_(__other.nested_accessor()) + {} + + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference access(data_handle_type __p, size_t __i) const noexcept + { + return conj_if_needed(__nested_element_type(__nested_accessor_.access(__p, __i))); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr typename offset_policy::data_handle_type + offset(data_handle_type __p, size_t __i) const noexcept + { + return __nested_accessor_.offset(__p, __i); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr const _NestedAccessor& nested_accessor() const noexcept + { + return __nested_accessor_; + } + +private: + _NestedAccessor __nested_accessor_; +}; + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +conjugated(mdspan<_ElementType, _Extents, _Layout, _Accessor> __a) +{ + using __value_type = typename decltype(__a)::value_type; + // Current status of [linalg] only optimizes if _Accessor is conjugated_accessor<_Accessor> for some _Accessor. + // There's a separate specialization for that case below. + + // P3050 optimizes conjugated's accessor type for when we know that it can't be complex: arithmetic types, + // and types for which `conj` is not ADL-findable. 
+ if constexpr (is_arithmetic_v<__value_type> || !__conj_if_needed::_HasConj<__value_type>) + { + return mdspan<_ElementType, _Extents, _Layout, _Accessor>(__a.data_handle(), __a.mapping(), __a.accessor()); + } + else + { + using __return_element_type = typename conjugated_accessor<_Accessor>::element_type; + using __return_accessor_type = conjugated_accessor<_Accessor>; + return mdspan<__return_element_type, _Extents, _Layout, __return_accessor_type>{ + __a.data_handle(), __a.mapping(), __return_accessor_type(__a.accessor())}; + } + _CCCL_UNREACHABLE(); +} + +// Conjugation is self-annihilating +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +conjugated(mdspan<_ElementType, _Extents, _Layout, conjugated_accessor<_NestedAccessor>> __a) +{ + using __return_element_type = typename _NestedAccessor::element_type; + using __return_accessor_type = _NestedAccessor; + return mdspan<__return_element_type, _Extents, _Layout, __return_accessor_type>( + __a.data_handle(), __a.mapping(), __a.accessor().nested_accessor()); +} + +} // end namespace linalg + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 +#endif // _LIBCUDACXX___LINALG_CONJUGATED_HPP diff --git a/libcudacxx/include/cuda/std/__linalg/scaled.h b/libcudacxx/include/cuda/std/__linalg/scaled.h new file mode 100644 index 00000000000..eabd7a6d520 --- /dev/null +++ b/libcudacxx/include/cuda/std/__linalg/scaled.h @@ -0,0 +1,135 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +// ************************************************************************ +//@HEADER + +#ifndef _LIBCUDACXX___LINALG_SCALED_HPP +#define _LIBCUDACXX___LINALG_SCALED_HPP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +namespace linalg +{ + +template +class scaled_accessor +{ +public: + using element_type = add_const_t< + decltype(_CUDA_VSTD::declval<_ScalingFactor>() * _CUDA_VSTD::declval())>; + using reference = remove_const_t; + using data_handle_type = typename _NestedAccessor::data_handle_type; + using offset_policy = scaled_accessor<_ScalingFactor, typename _NestedAccessor::offset_policy>; + + _CCCL_HIDE_FROM_ABI constexpr scaled_accessor() = default; + + _CCCL_TEMPLATE(class _OtherScalingFactor, class _OtherNestedAccessor) + _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _NestedAccessor, const _OtherNestedAccessor&) + _CCCL_AND _CCCL_TRAIT(is_constructible, _ScalingFactor, _OtherScalingFactor) + _CCCL_AND(!_CCCL_TRAIT(is_convertible, _OtherNestedAccessor, _NestedAccessor))) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr scaled_accessor( + const scaled_accessor<_OtherScalingFactor, _OtherNestedAccessor>& __other) + : __scaling_factor_(__other.scaling_factor()) + , __nested_accessor_(__other.nested_accessor()) + {} + + _CCCL_TEMPLATE(class _OtherScalingFactor, class _OtherNestedAccessor) + _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _NestedAccessor, const _OtherNestedAccessor&) + _CCCL_AND _CCCL_TRAIT(is_constructible, _ScalingFactor, _OtherScalingFactor) + _CCCL_AND _CCCL_TRAIT(is_convertible, _OtherNestedAccessor, _NestedAccessor)) + _LIBCUDACXX_HIDE_FROM_ABI constexpr scaled_accessor( + const scaled_accessor<_OtherScalingFactor, _OtherNestedAccessor>& __other) + : __scaling_factor_(__other.scaling_factor()) + , __nested_accessor_(__other.nested_accessor()) + {} + + _LIBCUDACXX_HIDE_FROM_ABI constexpr scaled_accessor(const _ScalingFactor& __s, const _NestedAccessor& __a) + : __scaling_factor_(__s) + , __nested_accessor_(__a) + {} + + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference access(data_handle_type __p, size_t __i) const + { + return __scaling_factor_ * typename _NestedAccessor::element_type(__nested_accessor_.access(__p, __i)); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI + typename offset_policy::data_handle_type constexpr offset(data_handle_type __p, size_t __i) const + { + return __nested_accessor_.offset(__p, __i); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _NestedAccessor nested_accessor() const noexcept + { + return __nested_accessor_; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _ScalingFactor scaling_factor() const noexcept + { + return __scaling_factor_; + } + +private: + _ScalingFactor __scaling_factor_; + _NestedAccessor __nested_accessor_; +}; + +namespace __detail +{ + +template +using __scaled_element_type = add_const_t::element_type>; + +} // namespace __detail + +template +_CCCL_NODISCARD +_LIBCUDACXX_HIDE_FROM_ABI constexpr mdspan<__detail::__scaled_element_type<_ScalingFactor, _Accessor>, + _Extents, + _Layout, + scaled_accessor<_ScalingFactor, _Accessor>> +scaled(_ScalingFactor __scaling_factor, mdspan<_ElementType, _Extents, _Layout, 
_Accessor> __x) +{ + using __acc_type = scaled_accessor<_ScalingFactor, _Accessor>; + return {__x.data_handle(), __x.mapping(), __acc_type{__scaling_factor, __x.accessor()}}; +} + +} // end namespace linalg + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 +#endif // _LIBCUDACXX___LINALG_SCALED_HPP diff --git a/libcudacxx/include/cuda/std/__linalg/transposed.h b/libcudacxx/include/cuda/std/__linalg/transposed.h new file mode 100644 index 00000000000..707cfa8bfe8 --- /dev/null +++ b/libcudacxx/include/cuda/std/__linalg/transposed.h @@ -0,0 +1,330 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +// ************************************************************************ +//@HEADER + +#ifndef _LIBCUDACXX___LINALG_TRANSPOSED_HPP +#define _LIBCUDACXX___LINALG_TRANSPOSED_HPP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +namespace linalg +{ + +namespace __detail +{ +// This struct helps us impose the rank constraint on the __type alias itself. 
+_CCCL_TEMPLATE(class _Extents) +_CCCL_REQUIRES((_Extents::rank() == 2)) +struct __transpose_extents_t_impl +{ + using __type = extents; +}; + +template +using __transpose_extents_t = typename __transpose_extents_t_impl<_Extents>::__type; + +_CCCL_TEMPLATE(class _Extents) +_CCCL_REQUIRES((_Extents::rank() == 2)) +_LIBCUDACXX_HIDE_FROM_ABI constexpr __transpose_extents_t<_Extents> __transpose_extents(const _Extents& __e) +{ + static_assert(is_same_v::index_type, typename _Extents::index_type>, + "Please fix __transpose_extents_t to account for P2553, which adds a template parameter SizeType to " + "extents."); + constexpr size_t __ext0 = _Extents::static_extent(0); + constexpr size_t __ext1 = _Extents::static_extent(1); + if constexpr (__ext0 == dynamic_extent) + { + if constexpr (__ext1 == dynamic_extent) + { + return __transpose_extents_t<_Extents>{__e.extent(1), __e.extent(0)}; + } + else + { + return __transpose_extents_t<_Extents>{/* __e.extent(1), */ __e.extent(0)}; + } + } + else + { + if constexpr (__ext1 == dynamic_extent) + { + return __transpose_extents_t<_Extents>{__e.extent(1) /* , __e.extent(0) */}; + } + else + { + return __transpose_extents_t<_Extents>{}; // all extents are static + } + } + _CCCL_UNREACHABLE(); // GCC9 workaround +} + +} // namespace __detail + +template +class layout_transpose +{ +public: + using nested_layout_type = _Layout; + + template + struct mapping + { + private: + using __nested_mapping_type = typename _Layout::template mapping<__detail::__transpose_extents_t<_Extents>>; + + static constexpr bool __required_span_size_noexcept = noexcept(__nested_mapping_type{}.required_span_size()); + + static constexpr bool __is_nested_unique_noexcept = noexcept(__nested_mapping_type{}.is_unique()); + + static constexpr bool __is_exhaustive_noexcept = noexcept(__nested_mapping_type{}.is_exhaustive()); + + static constexpr bool __is_strided_noexcept = noexcept(__nested_mapping_type{}.is_strided()); + + public: + using extents_type = _Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_transpose; + + _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit mapping(const __nested_mapping_type& __map) + : __nested_mapping_(__map) + , __extents_(__detail::__transpose_extents(__map.extents())) + {} + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr const extents_type& extents() const noexcept + { + return __extents_; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr index_type required_span_size() const + noexcept(__required_span_size_noexcept) + { + return __nested_mapping_.required_span_size(); + } + + _CCCL_TEMPLATE(class _IndexType0, class _IndexType1) + _CCCL_REQUIRES(_CCCL_TRAIT(is_convertible, _IndexType0, index_type) + _CCCL_AND _CCCL_TRAIT(is_convertible, _IndexType1, index_type)) + _LIBCUDACXX_HIDE_FROM_ABI constexpr index_type operator()(_IndexType0 __i, _IndexType1 __j) const + { + return __nested_mapping_(__j, __i); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr const __nested_mapping_type& nested_mapping() const noexcept + { + return __nested_mapping_; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool is_always_unique() noexcept + { + return __nested_mapping_type::is_always_unique(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool is_always_exhaustive() noexcept + { + return __nested_mapping_type::is_always_contiguous(); + } + + _CCCL_NODISCARD 
_LIBCUDACXX_HIDE_FROM_ABI static constexpr bool is_always_strided() noexcept + { + return __nested_mapping_type::is_always_strided(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool is_unique() const noexcept(__is_nested_unique_noexcept) + { + return __nested_mapping_.is_unique(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool is_exhaustive() const noexcept(__is_exhaustive_noexcept) + { + return __nested_mapping_.is_exhaustive(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool is_strided() const noexcept(__is_strided_noexcept) + { + return __nested_mapping_.is_strided(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr index_type stride(size_t __r) const + { + _CCCL_ASSERT(this->is_strided(), "layout must be strided"); + _CCCL_ASSERT(__r < extents_type::rank(), "rank must be less than extents rank"); + return __nested_mapping_.stride(__r == 0 ? 1 : 0); + } + + template + _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool + operator==(const mapping& __lhs, const mapping<_OtherExtents>& __rhs) noexcept + { + return __lhs.__nested_mapping_ == __rhs.__nested_mapping_; + } + + template + _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool + operator!=(const mapping& __lhs, const mapping<_OtherExtents>& __rhs) noexcept + { + return __lhs.__nested_mapping_ != __rhs.__nested_mapping_; + } + + private: + __nested_mapping_type __nested_mapping_; + extents_type __extents_; + }; +}; + +namespace __detail +{ + +template +struct __transposed_element_accessor +{ + using __element_type = _ElementType; + using __accessor_type = _Accessor; + + _LIBCUDACXX_HIDE_FROM_ABI static constexpr __accessor_type __accessor(const _Accessor& __a) + { + return __accessor_type(__a); + } +}; + +template +struct __transposed_element_accessor<_ElementType, default_accessor<_ElementType>> +{ + using __element_type = _ElementType; + using __accessor_type = default_accessor<__element_type>; + + _LIBCUDACXX_HIDE_FROM_ABI static constexpr __accessor_type __accessor(const default_accessor<_ElementType>& __a) + { + return __accessor_type(__a); + } +}; + +template +struct __transposed_layout +{ + using __layout_type = layout_transpose<_Layout>; + + template + _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __mapping(const __OriginalMapping& __orig_map) + { + using __extents_type = __transpose_extents_t; + using __return_mapping_type = typename __layout_type::template __mapping<__extents_type>; + return __return_mapping_type{__orig_map}; + } +}; + +template <> +struct __transposed_layout +{ + using __layout_type = layout_right; + + template + _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto + __mapping(const typename layout_left::template mapping<_OriginalExtents>& __orig_map) + { + using __original_mapping_type = typename layout_left::template mapping<_OriginalExtents>; + using __extents_type = __transpose_extents_t; + using __return_mapping_type = typename __layout_type::template mapping<__extents_type>; + return __return_mapping_type{__transpose_extents(__orig_map.extents())}; + } +}; + +template <> +struct __transposed_layout +{ + using __layout_type = layout_left; + + template + _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto + __mapping(const typename layout_right::template mapping<_OriginalExtents>& __orig_map) + { + using __original_mapping_type = typename layout_right::template mapping<_OriginalExtents>; + using __extents_type = __transpose_extents_t; + using __return_mapping_type = typename __layout_type::template mapping<__extents_type>; + return 
__return_mapping_type{__transpose_extents(__orig_map.extents())}; + } +}; + +template <> +struct __transposed_layout +{ + using __layout_type = layout_stride; + + template + _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto + __mapping(const typename layout_stride::template mapping<_OriginalExtents>& __orig_map) + { + using __original_mapping_type = typename layout_stride::template mapping<_OriginalExtents>; + using __original_extents_type = typename __original_mapping_type::extents_type; + using __extents_type = __transpose_extents_t<__original_extents_type>; + using __return_mapping_type = typename __layout_type::template mapping<__extents_type>; + return __return_mapping_type{ + __transpose_extents(__orig_map.extents()), + array{ + __orig_map.stride(1), __orig_map.stride(0)}}; + } +}; + +// TODO add support for padded layouts + +template +struct __transposed_layout> +{ + using __layout_type = _NestedLayout; +}; + +} // namespace __detail + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +transposed(mdspan<_ElementType, _Extents, _Layout, _Accessor> __a) +{ + using __element_type = typename __detail::__transposed_element_accessor<_ElementType, _Accessor>::__element_type; + using __layout_type = typename __detail::__transposed_layout<_Layout>::__layout_type; + using __accessor_type = typename __detail::__transposed_element_accessor<_ElementType, _Accessor>::__accessor_type; + auto __mapping = __detail::__transposed_layout<_Layout>::__mapping(__a.mapping()); + auto __accessor = __detail::__transposed_element_accessor<_ElementType, _Accessor>::__accessor(__a.accessor()); + return mdspan<__element_type, typename decltype(__mapping)::extents_type, __layout_type, __accessor_type>{ + __a.data_handle(), __mapping, __accessor}; +} + +} // end namespace linalg + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 +#endif // _LIBCUDACXX___LINALG_TRANSPOSED_HPP diff --git a/libcudacxx/include/cuda/std/__memory/allocator.h b/libcudacxx/include/cuda/std/__memory/allocator.h index c771226e191..ae90ebc0d72 100644 --- a/libcudacxx/include/cuda/std/__memory/allocator.h +++ b/libcudacxx/include/cuda/std/__memory/allocator.h @@ -49,14 +49,14 @@ template <> class _CCCL_TYPE_VISIBILITY_DEFAULT allocator { public: - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef void* pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const void* const_pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef void value_type; + using pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = void*; + using const_pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = const void*; + using value_type _LIBCUDACXX_DEPRECATED_IN_CXX17 = void; template struct _LIBCUDACXX_DEPRECATED_IN_CXX17 rebind { - typedef allocator<_Up> other; + using other = allocator<_Up>; }; }; @@ -64,14 +64,14 @@ template <> class _CCCL_TYPE_VISIBILITY_DEFAULT allocator { public: - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const void* pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const void* const_pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const void value_type; + using pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = const void*; + using const_pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = const void*; + using value_type _LIBCUDACXX_DEPRECATED_IN_CXX17 = const void; template struct _LIBCUDACXX_DEPRECATED_IN_CXX17 rebind { - typedef allocator<_Up> other; + using other = allocator<_Up>; }; }; #endif // _CCCL_STD_VER <= 2017 @@ -109,11 +109,11 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator : private __non_trivial_if struct 
_LIBCUDACXX_DEPRECATED_IN_CXX17 rebind { - typedef allocator<_Up> other; + using other = allocator<_Up>; }; _LIBCUDACXX_DEPRECATED_IN_CXX17 _LIBCUDACXX_HIDE_FROM_ABI pointer address(reference __x) const noexcept @@ -213,11 +213,11 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator static_assert(!_CCCL_TRAIT(is_volatile, _Tp), "std::allocator does not support volatile types"); public: - typedef size_t size_type; - typedef ptrdiff_t difference_type; - typedef const _Tp value_type; - typedef true_type propagate_on_container_move_assignment; - typedef true_type is_always_equal; + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = const _Tp; + using propagate_on_container_move_assignment = true_type; + using is_always_equal = true_type; _CCCL_CONSTEXPR_CXX20 allocator() noexcept = default; @@ -262,15 +262,15 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator // C++20 Removed members #if _CCCL_STD_VER <= 2017 - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const _Tp* pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const _Tp* const_pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const _Tp& reference; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const _Tp& const_reference; + using pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = const _Tp*; + using const_pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = const _Tp*; + using reference _LIBCUDACXX_DEPRECATED_IN_CXX17 = const _Tp&; + using const_reference _LIBCUDACXX_DEPRECATED_IN_CXX17 = const _Tp&; template struct _LIBCUDACXX_DEPRECATED_IN_CXX17 rebind { - typedef allocator<_Up> other; + using other = allocator<_Up>; }; _LIBCUDACXX_DEPRECATED_IN_CXX17 _LIBCUDACXX_HIDE_FROM_ABI const_pointer address(const_reference __x) const noexcept diff --git a/libcudacxx/include/cuda/std/__memory/allocator_arg_t.h b/libcudacxx/include/cuda/std/__memory/allocator_arg_t.h index ae88ce57615..d1ca1ab83a7 100644 --- a/libcudacxx/include/cuda/std/__memory/allocator_arg_t.h +++ b/libcudacxx/include/cuda/std/__memory/allocator_arg_t.h @@ -42,10 +42,10 @@ _CCCL_INLINE_VAR constexpr allocator_arg_t allocator_arg = allocator_arg_t(); template struct __uses_alloc_ctor_imp { - typedef _CCCL_NODEBUG_ALIAS remove_cvref_t<_Alloc> _RawAlloc; - static const bool __ua = uses_allocator<_Tp, _RawAlloc>::value; - static const bool __ic = is_constructible<_Tp, allocator_arg_t, _Alloc, _Args...>::value; - static const int value = __ua ? 2 - __ic : 0; + using _RawAlloc _CCCL_NODEBUG_ALIAS = remove_cvref_t<_Alloc>; + static const bool __ua = uses_allocator<_Tp, _RawAlloc>::value; + static const bool __ic = is_constructible<_Tp, allocator_arg_t, _Alloc, _Args...>::value; + static const int value = __ua ? 
2 - __ic : 0; }; template diff --git a/libcudacxx/include/cuda/std/__memory/allocator_destructor.h b/libcudacxx/include/cuda/std/__memory/allocator_destructor.h index e6fc850a086..a850e834282 100644 --- a/libcudacxx/include/cuda/std/__memory/allocator_destructor.h +++ b/libcudacxx/include/cuda/std/__memory/allocator_destructor.h @@ -29,11 +29,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template class __allocator_destructor { - typedef _CCCL_NODEBUG_ALIAS allocator_traits<_Alloc> __alloc_traits; + using __alloc_traits _CCCL_NODEBUG_ALIAS = allocator_traits<_Alloc>; public: - typedef _CCCL_NODEBUG_ALIAS typename __alloc_traits::pointer pointer; - typedef _CCCL_NODEBUG_ALIAS typename __alloc_traits::size_type size_type; + using pointer _CCCL_NODEBUG_ALIAS = typename __alloc_traits::pointer; + using size_type _CCCL_NODEBUG_ALIAS = typename __alloc_traits::size_type; private: _Alloc& __alloc_; diff --git a/libcudacxx/include/cuda/std/__memory/allocator_traits.h b/libcudacxx/include/cuda/std/__memory/allocator_traits.h index 035731687a3..a22e5b09695 100644 --- a/libcudacxx/include/cuda/std/__memory/allocator_traits.h +++ b/libcudacxx/include/cuda/std/__memory/allocator_traits.h @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include _CCCL_PUSH_MACROS @@ -567,7 +567,7 @@ using __rebind_alloc _CCCL_NODEBUG_ALIAS = typename _Traits::template rebind_all template struct __rebind_alloc_helper { - typedef _CCCL_NODEBUG_ALIAS typename _Traits::template rebind_alloc<_Tp> type; + using type _CCCL_NODEBUG_ALIAS = typename _Traits::template rebind_alloc<_Tp>; }; #undef _LIBCUDACXX_ALLOCATOR_TRAITS_HAS_XXX diff --git a/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h b/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h index 5752a48ec04..445c0166779 100644 --- a/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h +++ b/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h @@ -36,7 +36,7 @@ struct __builtin_new_allocator { struct __builtin_new_deleter { - typedef void* pointer_type; + using pointer_type = void*; _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit __builtin_new_deleter(size_t __size, size_t __align) noexcept : __size_(__size) @@ -53,7 +53,7 @@ struct __builtin_new_allocator size_t __align_; }; - typedef unique_ptr __holder_t; + using __holder_t = unique_ptr; _LIBCUDACXX_HIDE_FROM_ABI static __holder_t __allocate_bytes(size_t __s, size_t __align) { diff --git a/libcudacxx/include/cuda/std/__memory/pointer_traits.h b/libcudacxx/include/cuda/std/__memory/pointer_traits.h index d102dde7a74..cd04ccfaad9 100644 --- a/libcudacxx/include/cuda/std/__memory/pointer_traits.h +++ b/libcudacxx/include/cuda/std/__memory/pointer_traits.h @@ -49,19 +49,19 @@ struct __pointer_traits_element_type; template struct __pointer_traits_element_type<_Ptr, true> { - typedef _CCCL_NODEBUG_ALIAS typename _Ptr::element_type type; + using type _CCCL_NODEBUG_ALIAS = typename _Ptr::element_type; }; template