diff --git a/.clangd b/.clangd index 49cceb761a7..2963e5891df 100644 --- a/.clangd +++ b/.clangd @@ -51,6 +51,7 @@ CompileFlags: # strip CUDA flags unknown to clang - "-ccbin*" - "--compiler-options*" + - "--extended-lambda" - "--expt-extended-lambda" - "--expt-relaxed-constexpr" - "-forward-unknown-to-host-compiler" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 24864c6631a..65a57ee3469 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,20 +1,29 @@ -# general codeowners for all files -# (Order matters. This needs to be at the top) -* @nvidia/cccl-codeowners - # Libraries -thrust/ @nvidia/cccl-thrust-codeowners @nvidia/cccl-codeowners -cub/ @nvidia/cccl-cub-codeowners @nvidia/cccl-codeowners -libcudacxx/ @nvidia/cccl-libcudacxx-codeowners @nvidia/cccl-codeowners +thrust/ @nvidia/cccl-thrust-codeowners +cub/ @nvidia/cccl-cub-codeowners +libcudacxx/ @nvidia/cccl-libcudacxx-codeowners cudax/ @nvidia/cccl-cudax-codeowners c/ @nvidia/cccl-c-codeowners python/ @nvidia/cccl-python-codeowners # Infrastructure -.github/ @nvidia/cccl-infra-codeowners @nvidia/cccl-codeowners -ci/ @nvidia/cccl-infra-codeowners @nvidia/cccl-codeowners -.devcontainer/ @nvidia/cccl-infra-codeowners @nvidia/cccl-codeowners +.github/ @nvidia/cccl-infra-codeowners +ci/ @nvidia/cccl-infra-codeowners +.devcontainer/ @nvidia/cccl-infra-codeowners +.pre-commit-config.yaml @nvidia/cccl-infra-codeowners +.clang-format @nvidia/cccl-infra-codeowners +.clangd @nvidia/cccl-infra-codeowners +c2h/ @nvidia/cccl-infra-codeowners +.vscode @nvidia/cccl-infra-codeowners # cmake -**/CMakeLists.txt @nvidia/cccl-cmake-codeowners @nvidia/cccl-codeowners -**/cmake/ @nvidia/cccl-cmake-codeowners @nvidia/cccl-codeowners +**/CMakeLists.txt @nvidia/cccl-cmake-codeowners +**/cmake/ @nvidia/cccl-cmake-codeowners + +# benchmarks +benchmarks/ @nvidia/cccl-benchmark-codeowners +**/benchmarks @nvidia/cccl-benchmark-codeowners + +# docs +docs/ @nvidia/cccl-docs-codeowners +examples/ @nvidia/cccl-docs-codeowners diff --git a/.github/actions/docs-build/action.yml b/.github/actions/docs-build/action.yml index db7f3231742..bf2e3077ebb 100644 --- a/.github/actions/docs-build/action.yml +++ b/.github/actions/docs-build/action.yml @@ -54,4 +54,4 @@ runs: # Upload docs as pages artifacts - name: Upload artifact if: ${{ inputs.upload_pages_artifact == 'true' }} - uses: actions/upload-pages-artifact@v2 + uses: actions/upload-pages-artifact@v3 diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index 2b1cff7b4f2..2f655b4fddc 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -45,4 +45,4 @@ jobs: steps: - name: Deploy to GitHub Pages id: deployment - uses: actions/deploy-pages@v2 + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/build-rapids.yml b/.github/workflows/build-rapids.yml index aaee38b05e7..4ee586d0121 100644 --- a/.github/workflows/build-rapids.yml +++ b/.github/workflows/build-rapids.yml @@ -134,6 +134,12 @@ jobs: sccache --show-adv-stats done done + + # Exit with error if any failures occurred + if test ${#failures[@]} -ne 0; then + exit 1 + fi + EOF chmod +x "$RUNNER_TEMP"/ci{,-entrypoint}.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d317e931e78..e61d2f349ea 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,6 +43,17 @@ repos: hooks: - id: ruff # linter - id: ruff-format # formatter + + # TOML lint & format + - repo: https://github.com/ComPWA/taplo-pre-commit + rev: v0.9.3 + hooks: + # See 
https://github.com/NVIDIA/cccl/issues/3426 + # - id: taplo-lint + # exclude: "^docs/" + - id: taplo-format + exclude: "^docs/" + - repo: https://github.com/codespell-project/codespell rev: v2.3.0 hooks: diff --git a/CMakePresets.json b/CMakePresets.json index bd10a95200b..dcaf9b75977 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -73,8 +73,6 @@ "CUB_ENABLE_DIALECT_CPP20": true, "THRUST_ENABLE_MULTICONFIG": true, "THRUST_MULTICONFIG_WORKLOAD": "LARGE", - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11": true, - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP14": true, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17": true, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP20": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP": true, @@ -128,28 +126,6 @@ "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true } }, - { - "name": "libcudacxx-cpp11", - "displayName": "libcu++: C++11", - "inherits": "libcudacxx-base", - "cacheVariables": { - "CMAKE_CXX_STANDARD": "11", - "CMAKE_CUDA_STANDARD": "11", - "LIBCUDACXX_TEST_STANDARD_VER": "c++11", - "CCCL_IGNORE_DEPRECATED_CPP_11": true - } - }, - { - "name": "libcudacxx-cpp14", - "displayName": "libcu++: C++14", - "inherits": "libcudacxx-base", - "cacheVariables": { - "CMAKE_CXX_STANDARD": "14", - "CMAKE_CUDA_STANDARD": "14", - "LIBCUDACXX_TEST_STANDARD_VER": "c++14", - "CCCL_IGNORE_DEPRECATED_CPP_14": true - } - }, { "name": "libcudacxx-cpp17", "displayName": "libcu++: C++17", @@ -179,28 +155,6 @@ "CMAKE_CUDA_ARCHITECTURES": "70" } }, - { - "name": "libcudacxx-nvrtc-cpp11", - "displayName": "libcu++ NVRTC: C++11", - "inherits": "libcudacxx-nvrtc-base", - "cacheVariables": { - "CMAKE_CXX_STANDARD": "11", - "CMAKE_CUDA_STANDARD": "11", - "LIBCUDACXX_TEST_STANDARD_VER": "c++11", - "CCCL_IGNORE_DEPRECATED_CPP_11": true - } - }, - { - "name": "libcudacxx-nvrtc-cpp14", - "displayName": "libcu++ NVRTC: C++14", - "inherits": "libcudacxx-nvrtc-base", - "cacheVariables": { - "CMAKE_CXX_STANDARD": "14", - "CMAKE_CUDA_STANDARD": "14", - "LIBCUDACXX_TEST_STANDARD_VER": "c++14", - "CCCL_IGNORE_DEPRECATED_CPP_14": true - } - }, { "name": "libcudacxx-nvrtc-cpp17", "displayName": "libcu++ NVRTC: C++17", @@ -261,8 +215,6 @@ "THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB": true, - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11": false, - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP14": false, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17": false, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP20": false } @@ -420,22 +372,6 @@ "libcudacxx.test.atomics.ptx" ] }, - { - "name": "libcudacxx-nvrtc-cpp11", - "hidden": false, - "inherits": [ - "libcudacxx-nvrtcc" - ], - "configurePreset": "libcudacxx-nvrtc-cpp11" - }, - { - "name": "libcudacxx-nvrtc-cpp14", - "hidden": false, - "inherits": [ - "libcudacxx-nvrtcc" - ], - "configurePreset": "libcudacxx-nvrtc-cpp14" - }, { "name": "libcudacxx-nvrtc-cpp17", "hidden": false, @@ -452,20 +388,6 @@ ], "configurePreset": "libcudacxx-nvrtc-cpp20" }, - { - "name": "libcudacxx-cpp11", - "configurePreset": "libcudacxx-cpp11", - "inherits": [ - "libcudacxx-base" - ] - }, - { - "name": "libcudacxx-cpp14", - "configurePreset": "libcudacxx-cpp14", - "inherits": [ - "libcudacxx-base" - ] - }, { "name": "libcudacxx-cpp17", "configurePreset": "libcudacxx-cpp17", @@ -572,20 +494,6 @@ "outputOnFailure": false } }, - { - "name": "libcudacxx-lit-cpp11", - "configurePreset": "libcudacxx-cpp11", - "inherits": [ - "libcudacxx-lit-base" - ] - }, - { - "name": "libcudacxx-lit-cpp14", - "configurePreset": "libcudacxx-cpp14", - "inherits": 
[ - "libcudacxx-lit-base" - ] - }, { "name": "libcudacxx-lit-cpp17", "configurePreset": "libcudacxx-cpp17", @@ -607,20 +515,6 @@ "libcudacxx-lit-base" ] }, - { - "name": "libcudacxx-nvrtc-cpp11", - "configurePreset": "libcudacxx-nvrtc-cpp11", - "inherits": [ - "libcudacxx-nvrtc-base" - ] - }, - { - "name": "libcudacxx-nvrtc-cpp14", - "configurePreset": "libcudacxx-nvrtc-cpp14", - "inherits": [ - "libcudacxx-nvrtc-base" - ] - }, { "name": "libcudacxx-nvrtc-cpp17", "configurePreset": "libcudacxx-nvrtc-cpp17", diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 703a7ead85b..54627d06868 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -160,7 +160,7 @@ std::string get_single_tile_kernel_name( check(nvrtcGetTypeName(&reduction_op_t)); return std::format( - "cub::DeviceReduceSingleTileKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}>", + "cub::detail::reduce::DeviceReduceSingleTileKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}>", chained_policy_t, input_iterator_t, output_iterator_t, @@ -192,7 +192,7 @@ std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it check(nvrtcGetTypeName(&transform_op_t)); return std::format( - "cub::DeviceReduceKernel<{0}, {1}, {2}, {3}, {4}, {5}>", + "cub::detail::reduce::DeviceReduceKernel<{0}, {1}, {2}, {3}, {4}, {5}>", chained_policy_t, input_iterator_t, offset_t, diff --git a/c/parallel/test/test_main.cpp b/c/parallel/test/test_main.cpp index 3e3b4900a5d..d1fb01d96bd 100644 --- a/c/parallel/test/test_main.cpp +++ b/c/parallel/test/test_main.cpp @@ -12,8 +12,7 @@ #include -#define CATCH_CONFIG_RUNNER -#include +#include int device_guard(int device_id) { @@ -40,7 +39,7 @@ int main(int argc, char* argv[]) int device_id{}; // Build a new parser on top of Catch's - using namespace Catch::clara; + using namespace Catch::Clara; auto cli = session.cli() | Opt(device_id, "device")["-d"]["--device"]("device id to use"); session.cli(cli); diff --git a/c/parallel/test/test_util.h b/c/parallel/test/test_util.h index 3f7010a3e36..456a717c4d8 100644 --- a/c/parallel/test/test_util.h +++ b/c/parallel/test/test_util.h @@ -22,7 +22,9 @@ #include #include -#include +#include +#include +#include #include #include diff --git a/c2h/include/c2h/catch2_main.h b/c2h/include/c2h/catch2_main.h index dc1fa2eba16..8005d33a649 100644 --- a/c2h/include/c2h/catch2_main.h +++ b/c2h/include/c2h/catch2_main.h @@ -36,13 +36,9 @@ //! executable, this header is included into each test. On the other hand, when all the tests are compiled into a single //! 
executable, this header is excluded from the tests and included into catch2_runner.cpp -#ifdef CUB_CONFIG_MAIN -# define CATCH_CONFIG_RUNNER -#endif - -#include +#include -#if defined(CUB_CONFIG_MAIN) +#ifdef CUB_CONFIG_MAIN # if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA # include @@ -59,7 +55,7 @@ int main(int argc, char* argv[]) int device_id{}; // Build a new parser on top of Catch's - using namespace Catch::clara; + using namespace Catch::Clara; auto cli = session.cli() | Opt(device_id, "device")["-d"]["--device"]("device id to use"); session.cli(cli); @@ -73,4 +69,4 @@ int main(int argc, char* argv[]) # endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA return session.run(argc, argv); } -#endif +#endif // CUB_CONFIG_MAIN diff --git a/c2h/include/c2h/catch2_test_helper.h b/c2h/include/c2h/catch2_test_helper.h index 729c1f43ebd..585b24a70b6 100644 --- a/c2h/include/c2h/catch2_test_helper.h +++ b/c2h/include/c2h/catch2_test_helper.h @@ -39,15 +39,37 @@ #include #include -#if __CUDACC_VER_MAJOR__ == 11 -_CCCL_NV_DIAG_SUPPRESS(177) // catch2 may contain unused variables -#endif // nvcc-11 - #include #include #include #include #include +#include +#include +#include +#include +#include + +// workaround for error #3185-D: no '#pragma diagnostic push' was found to match this 'diagnostic pop' +#if _CCCL_COMPILER(NVHPC) +# undef CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma("diag push") +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma("diag pop") +#endif +// workaround for error +// * MSVC14.39: #3185-D: no '#pragma diagnostic push' was found to match this 'diagnostic pop' +// * MSVC14.29: internal error: assertion failed: alloc_copy_of_pending_pragma: copied pragma has source sequence entry +// (pragma.c, line 526 in alloc_copy_of_pending_pragma) +// see also upstream Catch2 issue: https://github.com/catchorg/Catch2/issues/2636 +#if _CCCL_COMPILER(MSVC) +# undef CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS +#endif #ifndef VAR_IDX # define VAR_IDX 0 @@ -110,11 +132,11 @@ std::vector to_vec(std::vector const& vec) } } // namespace detail -#define REQUIRE_APPROX_EQ(ref, out) \ - { \ - auto vec_ref = detail::to_vec(ref); \ - auto vec_out = detail::to_vec(out); \ - REQUIRE_THAT(vec_ref, Catch::Approx(vec_out)); \ +#define REQUIRE_APPROX_EQ(ref, out) \ + { \ + auto vec_ref = detail::to_vec(ref); \ + auto vec_out = detail::to_vec(out); \ + REQUIRE_THAT(vec_ref, Catch::Matchers::Approx(vec_out)); \ } namespace detail @@ -140,7 +162,7 @@ struct bitwise_equal // Catch2 Matcher that calls `std::equal` with a default-constructable custom predicate template -struct CustomEqualsRangeMatcher : Catch::MatcherBase +struct CustomEqualsRangeMatcher : Catch::Matchers::MatcherBase { CustomEqualsRangeMatcher(Range const& range) : range{range} diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 881f553f65d..3441ab07b15 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -13,12 +13,11 @@ workflows: # Old CTK/compiler - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc7', 'gcc9', 'clang14', 'msvc2019']} # Current CTK build-only - - {jobs: ['build'], std: [11, 14], cxx: ['gcc7', 'clang14'], project: 'libcudacxx'} - - 
{jobs: ['build'], std: [17], cxx: ['gcc7', 'clang14']} + - {jobs: ['build'], std: 17, cxx: ['gcc7', 'clang14']} - {jobs: ['build'], std: 'max', cxx: ['gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']} - {jobs: ['build'], std: 'max', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], std: 'max', cxx: ['msvc2019']} - - {jobs: ['build'], std: [17, 20], cxx: ['gcc', 'clang', 'msvc']} + - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']} - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']} @@ -28,13 +27,13 @@ workflows: - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc']} - {jobs: ['test_lid0'], project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100', sm: 'gpu' } # Modded builds: - - {jobs: ['build'], std: [17, 20], ctk: '12.5', cxx: 'nvhpc'} + - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'} # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly. - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} # default_projects: clang-cuda - - {jobs: ['build'], std: [17, 20], cudacxx: 'clang', cxx: 'clang'} + - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90a'} # nvrtc: @@ -45,11 +44,11 @@ workflows: - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20, cxx: ['msvc14.36']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc10', 'gcc11', 'gcc12']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - - {jobs: ['build'], project: 'cudax', ctk: ['12.5'], std: [17, 20], cxx: ['nvhpc']} + - {jobs: ['build'], project: 'cudax', ctk: ['12.5'], std: 'all', cxx: ['nvhpc']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['msvc2022']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17, cxx: ['gcc'], sm: "90"} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc'], sm: "90a"} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: [17, 20], cxx: ['gcc', 'clang'], cpu: 'arm64'} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc12', 'clang', 'msvc']} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'} @@ -122,12 +121,12 @@ workflows: devcontainer_version: '25.02' # All supported C++ standards: -all_stds: [11, 14, 17, 20] +all_stds: [17, 20] ctk_versions: - 12.0: { stds: [11, 14, 17, 20] } - 12.5: { stds: [11, 14, 17, 20] } - 12.6: { stds: [11, 14, 17, 20], aka: 'curr' } + 12.0: { stds: [17, 20] } + 12.5: { stds: [17, 20] } + 12.6: { stds: [17, 20], aka: 'curr' } device_compilers: nvcc: # Version / stds are taken from CTK @@ -143,37 +142,37 @@ host_compilers: container_tag: 'gcc' exe: 'g++' versions: - 7: { stds: [11, 14, 17, ] } - 8: { stds: [11, 14, 17, ] } - 9: { stds: [11, 14, 17, ] } - 10: { stds: [11, 14, 17, 20] } - 11: { stds: [11, 14, 17, 20] } - 12: { stds: [11, 14, 17, 
20] } - 13: { stds: [11, 14, 17, 20] } + 7: { stds: [17, ] } + 8: { stds: [17, ] } + 9: { stds: [17, ] } + 10: { stds: [17, 20] } + 11: { stds: [17, 20] } + 12: { stds: [17, 20] } + 13: { stds: [17, 20] } clang: name: 'Clang' container_tag: 'llvm' exe: 'clang++' versions: - 14: { stds: [11, 14, 17, 20] } - 15: { stds: [11, 14, 17, 20] } - 16: { stds: [11, 14, 17, 20] } - 17: { stds: [11, 14, 17, 20] } - 18: { stds: [11, 14, 17, 20] } + 14: { stds: [17, 20] } + 15: { stds: [17, 20] } + 16: { stds: [17, 20] } + 17: { stds: [17, 20] } + 18: { stds: [17, 20] } msvc: name: 'MSVC' container_tag: 'cl' exe: cl versions: - 14.29: { stds: [ 14, 17, ], aka: '2019' } - 14.36: { stds: [ 14, 17, 20] } - 14.39: { stds: [ 14, 17, 20], aka: '2022' } + 14.29: { stds: [ 17, ], aka: '2019' } + 14.36: { stds: [ 17, 20] } + 14.39: { stds: [ 17, 20], aka: '2022' } nvhpc: name: 'NVHPC' container_tag: 'nvhpc' exe: nvc++ versions: - 24.7: { stds: [11, 14, 17, 20 ] } + 24.7: { stds: [17, 20 ] } # Jobs support the following properties: # @@ -234,10 +233,10 @@ jobs: projects: cccl: name: 'CCCL' - stds: [11, 14, 17, 20] + stds: [17, 20] libcudacxx: name: 'libcu++' - stds: [11, 14, 17, 20] + stds: [17, 20] cub: name: 'CUB' stds: [17, 20] diff --git a/ci/test_python.sh b/ci/test_python.sh index bd66cc57716..34900fdb8e0 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -8,25 +8,28 @@ print_environment_details fail_if_no_gpu -readonly prefix="${BUILD_DIR}/python/" -export PYTHONPATH="${prefix}:${PYTHONPATH:-}" +begin_group "⚙️ Existing site-packages" +pip freeze +end_group "⚙️ Existing site-packages" -pushd ../python/cuda_cooperative >/dev/null +for module in cuda_parallel cuda_cooperative; do -run_command "⚙️ Pip install cuda_cooperative" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "🚀 Pytest cuda_cooperative" python -m pytest -v ./tests + pushd "../python/${module}" >/dev/null -popd >/dev/null + TEMP_VENV_DIR="/tmp/${module}_venv" + rm -rf "${TEMP_VENV_DIR}" + python -m venv "${TEMP_VENV_DIR}" + . "${TEMP_VENV_DIR}/bin/activate" + echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt + run_command "⚙️ Pip install ${module}" pip install -c /tmp/cuda-cccl_constraints.txt .[test] + begin_group "⚙️ ${module} site-packages" + pip freeze + end_group "⚙️ ${module} site-packages" + run_command "🚀 Pytest ${module}" python -m pytest -v ./tests + deactivate -pushd ../python/cuda_parallel >/dev/null + popd >/dev/null -# Temporarily install the package twice to populate include directory as part of the first installation -# and to let manifest discover these includes during the second installation. Do not forget to remove the -# second installation after https://github.com/NVIDIA/cccl/issues/2281 is addressed. 
-run_command "⚙️ Pip install cuda_parallel once" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "⚙️ Pip install cuda_parallel twice" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "🚀 Pytest cuda_parallel" python -m pytest -v ./tests - -popd >/dev/null +done print_time_summary diff --git a/ci/update_version.sh b/ci/update_version.sh index c43303449bb..6a25a837d50 100755 --- a/ci/update_version.sh +++ b/ci/update_version.sh @@ -37,6 +37,7 @@ CUB_CMAKE_VERSION_FILE="lib/cmake/cub/cub-config-version.cmake" LIBCUDACXX_CMAKE_VERSION_FILE="lib/cmake/libcudacxx/libcudacxx-config-version.cmake" THRUST_CMAKE_VERSION_FILE="lib/cmake/thrust/thrust-config-version.cmake" CUDAX_CMAKE_VERSION_FILE="lib/cmake/cudax/cudax-config-version.cmake" +CUDA_CCCL_VERSION_FILE="python/cuda_cccl/cuda/cccl/_version.py" CUDA_COOPERATIVE_VERSION_FILE="python/cuda_cooperative/cuda/cooperative/_version.py" CUDA_PARALLEL_VERSION_FILE="python/cuda_parallel/cuda/parallel/_version.py" @@ -110,6 +111,7 @@ update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MAJOR \([0-9]\+\))" " update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MINOR \([0-9]\+\))" "set(cudax_VERSION_MINOR $minor)" update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_PATCH \([0-9]\+\))" "set(cudax_VERSION_PATCH $patch)" +update_file "$CUDA_CCCL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$major.$minor.$patch\"" update_file "$CUDA_COOPERATIVE_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" update_file "$CUDA_PARALLEL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" diff --git a/cmake/CCCLGetDependencies.cmake b/cmake/CCCLGetDependencies.cmake index cd7f3c9fc41..1a97d98820b 100644 --- a/cmake/CCCLGetDependencies.cmake +++ b/cmake/CCCLGetDependencies.cmake @@ -14,7 +14,7 @@ endmacro() macro(cccl_get_catch2) include("${_cccl_cpm_file}") - CPMAddPackage("gh:catchorg/Catch2@2.13.9") + CPMAddPackage("gh:catchorg/Catch2@3.8.0") endmacro() macro(cccl_get_fmt) diff --git a/cub/benchmarks/bench/merge_sort/keys.cu b/cub/benchmarks/bench/merge_sort/keys.cu index e1d7d165a79..9cca06463bb 100644 --- a/cub/benchmarks/bench/merge_sort/keys.cu +++ b/cub/benchmarks/bench/merge_sort/keys.cu @@ -25,6 +25,7 @@ * ******************************************************************************/ +#include #include #include @@ -84,7 +85,7 @@ void keys(nvbench::state& state, nvbench::type_list) using value_input_it_t = value_t*; using key_it_t = key_t*; using value_it_t = value_t*; - using offset_t = OffsetT; + using offset_t = cub::detail::choose_offset_t; using compare_op_t = less_t; #if !TUNE_BASE diff --git a/cub/benchmarks/bench/merge_sort/pairs.cu b/cub/benchmarks/bench/merge_sort/pairs.cu index f0238063efe..7b54cc49863 100644 --- a/cub/benchmarks/bench/merge_sort/pairs.cu +++ b/cub/benchmarks/bench/merge_sort/pairs.cu @@ -25,6 +25,7 @@ * ******************************************************************************/ +#include #include #include @@ -81,7 +82,7 @@ void pairs(nvbench::state& state, nvbench::type_list) using value_input_it_t = value_t*; using key_it_t = key_t*; using value_it_t = value_t*; - using offset_t = OffsetT; + using offset_t = cub::detail::choose_offset_t; using compare_op_t = less_t; #if !TUNE_BASE diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index fcd81e660f6..0da5f982561 100644 --- 
a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -86,10 +86,10 @@ void init_output_partition_buffer( FlagsItT d_flags, OffsetT num_items, T* d_out, - cub::detail::partition_distinct_output_t& d_partition_out_buffer) + cub::detail::select::partition_distinct_output_t& d_partition_out_buffer) { const auto selected_elements = thrust::count(d_flags, d_flags + num_items, true); - d_partition_out_buffer = cub::detail::partition_distinct_output_t{d_out, d_out + selected_elements}; + d_partition_out_buffer = cub::detail::select::partition_distinct_output_t{d_out, d_out + selected_elements}; } template @@ -109,7 +109,7 @@ void flagged(nvbench::state& state, nvbench::type_list, T*>::type; + conditional, T*>::type; #if !TUNE_BASE using policy_t = policy_hub_t; diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu index d456e65fc1c..337586d7f94 100644 --- a/cub/benchmarks/bench/partition/if.cu +++ b/cub/benchmarks/bench/partition/if.cu @@ -112,10 +112,10 @@ void init_output_partition_buffer( OffsetT num_items, T* d_out, SelectOpT select_op, - cub::detail::partition_distinct_output_t& d_partition_out_buffer) + cub::detail::select::partition_distinct_output_t& d_partition_out_buffer) { const auto selected_elements = thrust::count_if(d_in, d_in + num_items, select_op); - d_partition_out_buffer = cub::detail::partition_distinct_output_t{d_out, d_out + selected_elements}; + d_partition_out_buffer = cub::detail::select::partition_distinct_output_t{d_out, d_out + selected_elements}; } template @@ -135,7 +135,7 @@ void partition(nvbench::state& state, nvbench::type_list, T*>::type; + conditional, T*>::type; #if !TUNE_BASE using policy_t = policy_hub_t; diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu index b6b9e4fd537..bd04bcf3d43 100644 --- a/cub/benchmarks/bench/radix_sort/keys.cu +++ b/cub/benchmarks/bench/radix_sort/keys.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -109,7 +110,8 @@ constexpr std::size_t max_onesweep_temp_storage_size() using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; using hist_agent = cub::AgentRadixSortHistogram; - return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); + return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), + sizeof(typename hist_agent::TempStorage)); } template diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu index 4a9f229bca4..35d589f453e 100644 --- a/cub/benchmarks/bench/radix_sort/pairs.cu +++ b/cub/benchmarks/bench/radix_sort/pairs.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -107,7 +108,8 @@ constexpr std::size_t max_onesweep_temp_storage_size() using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; using hist_agent = cub::AgentRadixSortHistogram; - return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); + return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), + sizeof(typename hist_agent::TempStorage)); } template diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh index 9de575d0686..579d3757d3c 100644 --- a/cub/benchmarks/bench/reduce/base.cuh +++ b/cub/benchmarks/bench/reduce/base.cuh @@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list) }); } 
-NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types)) +NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); diff --git a/cub/benchmarks/bench/reduce/max.cu b/cub/benchmarks/bench/reduce/custom.cu similarity index 81% rename from cub/benchmarks/bench/reduce/max.cu rename to cub/benchmarks/bench/reduce/custom.cu index 791d5bfe167..0203ef60b8c 100644 --- a/cub/benchmarks/bench/reduce/max.cu +++ b/cub/benchmarks/bench/reduce/custom.cu @@ -25,11 +25,18 @@ * ******************************************************************************/ +// This benchmark uses a custom reduction operation, max_t, which is not known to CUB, so no operator specific +// optimizations (e.g. using redux or DPX instructions) are performed. This benchmark covers the unoptimized code path. + +// Because CUB cannot detect this operator, we cannot add any tunings based on the results of this benchmark. Its main +// use is to detect regressions. + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = max_t; +using value_types = all_types; +using op_t = max_t; #include "base.cuh" diff --git a/cub/benchmarks/bench/reduce/min.cu b/cub/benchmarks/bench/reduce/min.cu index 177d7628f6f..50b175f4ca8 100644 --- a/cub/benchmarks/bench/reduce/min.cu +++ b/cub/benchmarks/bench/reduce/min.cu @@ -24,14 +24,23 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ -// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures. -// It specifically uses cuda::minimum<> instead of a user-defined operator. -#define TUNE_T int16_t + +// This benchmark is intended to cover DPX instructions on Hopper+ architectures. It specifically uses cuda::minimum<> +// instead of a user-defined operator, which CUB recognizes to select an optimized code path. + +// Tuning parameters found for ::cuda::minimum<> apply equally for ::cuda::maximum<> +// Tuning parameters found for signed integer types apply equally for unsigned integer types +// TODO(bgruber): do tuning parameters found for int16_t apply equally for __half or __nv_bfloat16 on SM90+? + +#include + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = ::cuda::minimum<>; +// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them. +using value_types = fundamental_types; +using op_t = ::cuda::minimum<>; #include "base.cuh" diff --git a/cub/benchmarks/bench/reduce/sum.cu b/cub/benchmarks/bench/reduce/sum.cu index 4433724f090..ab65d7fe847 100644 --- a/cub/benchmarks/bench/reduce/sum.cu +++ b/cub/benchmarks/bench/reduce/sum.cu @@ -25,11 +25,18 @@ * ******************************************************************************/ +// This benchmark is intended to cover redux instructions on Ampere+ architectures. It specifically uses +// cuda::std::plus<> instead of a user-defined operator, which CUB recognizes to select an optimized code path. 
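// An illustrative sketch, not taken from the patch: the call pattern this benchmark covers. Passing the
// well-known ::cuda::std::plus<> to cub::DeviceReduce::Reduce lets CUB select its optimized sum path;
// swapping in a hand-written functor such as the hypothetical my_plus below exercises the generic,
// unoptimized path. Problem size and values are arbitrary.
#include <cub/device/device_reduce.cuh>

#include <thrust/device_vector.h>

#include <cuda/std/functional>

struct my_plus // user-defined operator: CUB cannot recognize it, so no operator-specific instructions are used
{
  __host__ __device__ int operator()(int a, int b) const
  {
    return a + b;
  }
};

int main()
{
  thrust::device_vector<int> in(1 << 20, 1);
  thrust::device_vector<int> out(1);
  const int* d_in = thrust::raw_pointer_cast(in.data());
  int* d_out      = thrust::raw_pointer_cast(out.data());
  const int n     = static_cast<int>(in.size());

  // Standard two-phase CUB call: first query the temporary storage size, then run the reduction.
  void* d_temp      = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n, ::cuda::std::plus<>{}, 0);
  thrust::device_vector<char> temp(temp_bytes);
  cub::DeviceReduce::Reduce(
    thrust::raw_pointer_cast(temp.data()), temp_bytes, d_in, d_out, n, ::cuda::std::plus<>{}, 0);
  // Replacing ::cuda::std::plus<>{} with my_plus{} gives the same result through the generic code path.
  return out[0] == n ? 0 : 1;
}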
+ +// Tuning parameters found for signed integer types apply equally for unsigned integer types + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = ::cuda::std::plus<>; +// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them. +using value_types = all_types; +using op_t = ::cuda::std::plus<>; #include "base.cuh" diff --git a/cub/benchmarks/nvbench_helper/CMakeLists.txt b/cub/benchmarks/nvbench_helper/CMakeLists.txt index 24b12c12154..bf8581fbf79 100644 --- a/cub/benchmarks/nvbench_helper/CMakeLists.txt +++ b/cub/benchmarks/nvbench_helper/CMakeLists.txt @@ -26,10 +26,9 @@ if (CUB_ENABLE_NVBENCH_HELPER_TESTS) test/gen_range.cu test/gen_entropy.cu test/gen_uniform_distribution.cu - test/gen_power_law_distribution.cu - test/main.cpp) + test/gen_power_law_distribution.cu) cccl_configure_target(${nvbench_helper_test_target} DIALECT 17) - target_link_libraries(${nvbench_helper_test_target} PRIVATE nvbench_helper Catch2::Catch2 Boost::math) + target_link_libraries(${nvbench_helper_test_target} PRIVATE nvbench_helper Catch2::Catch2WithMain Boost::math) if ("${device_system}" STREQUAL "cpp") target_compile_definitions(${nvbench_helper_test_target} PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP) endif() diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index e1928ec8516..8324650d044 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -1,5 +1,7 @@ #pragma once +#include + #include #include @@ -50,20 +52,20 @@ struct nvbench::type_strings<::cuda::std::integral_constant> namespace detail { -template +template struct push_back {}; -template -struct push_back> +template +struct push_back, Ts...> { - using type = nvbench::type_list; + using type = nvbench::type_list; }; } // namespace detail -template -using push_back_t = typename detail::push_back::type; +template +using push_back_t = typename detail::push_back::type; #ifdef TUNE_OffsetT using offset_types = nvbench::type_list; diff --git a/cub/benchmarks/nvbench_helper/test/gen_entropy.cu b/cub/benchmarks/nvbench_helper/test/gen_entropy.cu index 967b8ff0e88..12c96154f94 100644 --- a/cub/benchmarks/nvbench_helper/test/gen_entropy.cu +++ b/cub/benchmarks/nvbench_helper/test/gen_entropy.cu @@ -36,7 +36,8 @@ #include #include -#include +#include +#include #include template diff --git a/cub/benchmarks/nvbench_helper/test/gen_power_law_distribution.cu b/cub/benchmarks/nvbench_helper/test/gen_power_law_distribution.cu index 0d06d308b0b..599bb9293cb 100644 --- a/cub/benchmarks/nvbench_helper/test/gen_power_law_distribution.cu +++ b/cub/benchmarks/nvbench_helper/test/gen_power_law_distribution.cu @@ -33,7 +33,8 @@ #include #include -#include +#include +#include #include bool is_normal(thrust::host_vector data) diff --git a/cub/benchmarks/nvbench_helper/test/gen_range.cu b/cub/benchmarks/nvbench_helper/test/gen_range.cu index 064e0b2f1d2..f4eba3183b9 100644 --- a/cub/benchmarks/nvbench_helper/test/gen_range.cu +++ b/cub/benchmarks/nvbench_helper/test/gen_range.cu @@ -30,7 +30,8 @@ #include -#include +#include +#include #include using types = diff --git a/cub/benchmarks/nvbench_helper/test/gen_seed.cu b/cub/benchmarks/nvbench_helper/test/gen_seed.cu index 3f04b2c88d1..9f27d6931d5 100644 --- 
a/cub/benchmarks/nvbench_helper/test/gen_seed.cu +++ b/cub/benchmarks/nvbench_helper/test/gen_seed.cu @@ -28,7 +28,7 @@ #include #include -#include +#include #include using types = diff --git a/cub/benchmarks/nvbench_helper/test/gen_uniform_distribution.cu b/cub/benchmarks/nvbench_helper/test/gen_uniform_distribution.cu index ed09ef7535e..d37ba2b8fb6 100644 --- a/cub/benchmarks/nvbench_helper/test/gen_uniform_distribution.cu +++ b/cub/benchmarks/nvbench_helper/test/gen_uniform_distribution.cu @@ -34,7 +34,9 @@ #include #include -#include +#include +#include +#include #include template diff --git a/cub/benchmarks/nvbench_helper/test/main.cpp b/cub/benchmarks/nvbench_helper/test/main.cpp deleted file mode 100644 index 5dc819f5caa..00000000000 --- a/cub/benchmarks/nvbench_helper/test/main.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -#define CATCH_CONFIG_MAIN -#include diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh index 37e1a013193..c19cb90079a 100644 --- a/cub/cub/agent/agent_adjacent_difference.cuh +++ b/cub/cub/agent/agent_adjacent_difference.cuh @@ -63,6 +63,11 @@ struct AgentAdjacentDifferencePolicy static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; +namespace detail +{ +namespace adjacent_difference +{ + template +using AgentDifference CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = detail::adjacent_difference:: + AgentDifference; + +template +using AgentDifferenceInit CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::adjacent_difference::AgentDifferenceInit; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh index f9d5e8b16a1..2b926f582fe 100644 --- a/cub/cub/agent/agent_batch_memcpy.cuh +++ b/cub/cub/agent/agent_batch_memcpy.cuh @@ -60,6 +60,8 @@ CUB_NAMESPACE_BEGIN namespace detail { +namespace batch_memcpy +{ template _CCCL_FORCEINLINE _CCCL_DEVICE void LoadVectorAndFunnelShiftR(uint32_t const* aligned_ptr, uint32_t bit_shift, uint4& data_out) @@ -834,7 +836,7 @@ private: BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage) .ExclusiveSum(block_offset, block_offset, blev_tile_prefix_op); } - CTA_SYNC(); + __syncthreads(); // Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration) blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD; @@ -996,7 +998,7 @@ private: // Ensure all threads finished collaborative BlockExchange so temporary storage can be reused // with next iteration - CTA_SYNC(); + __syncthreads(); } } @@ -1026,7 +1028,7 @@ public: } // Ensure we can repurpose the BlockLoad's temporary storage - CTA_SYNC(); + __syncthreads(); // Count how many buffers fall into each size-class VectorizedSizeClassCounterT size_class_histogram = GetBufferSizeClassHistogram(buffer_sizes); @@ -1037,7 +1039,7 @@ public: .ExclusiveSum(size_class_histogram, size_class_histogram, size_class_agg); // Ensure we can repurpose the scan's temporary storage for scattering the buffer ids - CTA_SYNC(); + __syncthreads(); // Factor in the per-size-class counts / offsets // That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset @@ -1077,7 +1079,7 @@ public: // Ensure the prefix callback has finished using its temporary storage and that it can be reused // in the next stage - CTA_SYNC(); + __syncthreads(); // Scatter the buffers into one of the three partitions (TLEV, WLEV, BLEV) depending on their // size @@ -1085,7 +1087,7 @@ public: // Ensure all buffers have been partitioned by their size class AND // ensure that blev_buffer_offset has been written to shared memory - CTA_SYNC(); + __syncthreads(); // TODO: think about prefetching tile_buffer_{srcs,dsts} into shmem InputBufferIt tile_buffer_srcs = input_buffer_it + buffer_offset; @@ -1104,7 +1106,7 @@ public: tile_id); // Ensure we can repurpose the temporary storage required by EnqueueBLEVBuffers - CTA_SYNC(); + __syncthreads(); // Copy warp-level buffers BatchMemcpyWLEVBuffers( @@ -1172,7 +1174,7 @@ private: // buffers BLevBlockOffsetTileState blev_block_scan_state; }; - +} // namespace 
batch_memcpy } // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index e454dc837b1..2e98bf76771 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -134,6 +134,11 @@ struct AgentHistogramPolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace histogram +{ + /** * @brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating * in device-wide histogram . @@ -320,7 +325,7 @@ struct AgentHistogram } // Barrier to make sure all threads are done updating counters - CTA_SYNC(); + __syncthreads(); } // Initialize privatized bin counters. Specialized for privatized shared-memory counters @@ -350,7 +355,7 @@ struct AgentHistogram _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) { // Barrier to make sure all threads are done updating counters - CTA_SYNC(); + __syncthreads(); // Apply privatized bin counts to output bin counts #pragma unroll @@ -690,7 +695,7 @@ struct AgentHistogram ConsumeTile(tile_offset, TILE_SAMPLES); } - CTA_SYNC(); + __syncthreads(); // Get next tile if (threadIdx.x == 0) @@ -698,7 +703,7 @@ struct AgentHistogram temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; } - CTA_SYNC(); + __syncthreads(); tile_idx = temp_storage.tile_idx; } @@ -914,4 +919,31 @@ struct AgentHistogram } }; +} // namespace histogram +} // namespace detail + +template +using AgentHistogram CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::histogram::AgentHistogram< + AgentHistogramPolicyT, + PRIVATIZED_SMEM_BINS, + NUM_CHANNELS, + NUM_ACTIVE_CHANNELS, + SampleIteratorT, + CounterT, + PrivatizedDecodeOpT, + OutputDecodeOpT, + OffsetT, + LEGACY_PTX_ARCH>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh index ae457bb954d..9ae14c3e42e 100644 --- a/cub/cub/agent/agent_merge.cuh +++ b/cub/cub/agent/agent_merge.cuh @@ -22,7 +22,8 @@ #include -#include +#include +#include CUB_NAMESPACE_BEGIN namespace detail @@ -116,7 +117,7 @@ struct agent_t const Offset partition_end = merge_partitions[tile_idx + 1]; const Offset diag0 = items_per_tile * tile_idx; - const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + items_per_tile); + const Offset diag1 = (::cuda::std::min)(keys1_count + keys2_count, diag0 + items_per_tile); // compute bounding box for keys1 & keys2 const Offset keys1_beg = partition_beg; @@ -129,14 +130,14 @@ struct agent_t const int num_keys2 = static_cast(keys2_end - keys2_beg); key_type keys_loc[items_per_thread]; - gmem_to_reg( + merge_sort::gmem_to_reg( keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2); - reg_to_shared(&storage.keys_shared[0], keys_loc); - CTA_SYNC(); + merge_sort::reg_to_shared(&storage.keys_shared[0], keys_loc); + __syncthreads(); // use binary search in shared memory to find merge path for each of thread. 
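// The "merge path" referenced in the comment above, as a stand-alone host-side sketch using std types only;
// merge_path here is an illustrative helper, not CUB's MergePath. For a diagonal diag of the merged output,
// a binary search finds how many elements come from the first sorted range, so each thread or tile can merge
// an independent, equally sized piece.
#include <algorithm>
#include <cassert>
#include <vector>

// Returns how many of the first `diag` merged elements are taken from `a`; the rest come from `b`.
int merge_path(const std::vector<int>& a, const std::vector<int>& b, int diag)
{
  int lo = std::max(0, diag - static_cast<int>(b.size()));
  int hi = std::min(diag, static_cast<int>(a.size()));
  while (lo < hi)
  {
    const int mid = (lo + hi) / 2;
    if (a[mid] <= b[diag - 1 - mid]) // does the diagonal cross above or below this point?
    {
      lo = mid + 1;
    }
    else
    {
      hi = mid;
    }
  }
  return lo;
}

int main()
{
  const std::vector<int> a{1, 3, 5, 7};
  const std::vector<int> b{2, 4, 6, 8};
  // Splitting the 8-element merge at its midpoint takes two elements from each input (1,3 and 2,4).
  assert(merge_path(a, b, 4) == 2);
  return 0;
}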
// we can use int type here, because the number of items in shared memory is limited - const int diag0_loc = min(num_keys1 + num_keys2, items_per_thread * threadIdx.x); + const int diag0_loc = (::cuda::std::min)(num_keys1 + num_keys2, static_cast(items_per_thread * threadIdx.x)); const int keys1_beg_loc = MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op); @@ -158,7 +159,7 @@ struct agent_t keys_loc, indices, compare_op); - CTA_SYNC(); + __syncthreads(); // write keys if (IsFullTile) @@ -180,11 +181,12 @@ struct agent_t #endif // _CCCL_CUDACC_AT_LEAST(11, 8) { item_type items_loc[items_per_thread]; - gmem_to_reg( + merge_sort::gmem_to_reg( items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2); - CTA_SYNC(); // block_store_keys above uses shared memory, so make sure all threads are done before we write to it - reg_to_shared(&storage.items_shared[0], items_loc); - CTA_SYNC(); + __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write + // to it + merge_sort::reg_to_shared(&storage.items_shared[0], items_loc); + __syncthreads(); // gather items from shared mem #pragma unroll @@ -192,7 +194,7 @@ struct agent_t { items_loc[i] = storage.items_shared[indices[i]]; } - CTA_SYNC(); + __syncthreads(); // write from reg to gmem if (IsFullTile) @@ -214,7 +216,7 @@ struct agent_t const Offset tile_base = tile_idx * items_per_tile; // TODO(bgruber): random mixing of int and Offset const int items_in_tile = - static_cast(cub::min(static_cast(items_per_tile), keys1_count + keys2_count - tile_base)); + static_cast((::cuda::std::min)(static_cast(items_per_tile), keys1_count + keys2_count - tile_base)); if (items_in_tile == items_per_tile) { consume_tile(tile_idx, tile_base, items_per_tile); // full tile diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index dd8b559f2c4..4c74b73baf2 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -45,6 +45,9 @@ #include +#include +#include + CUB_NAMESPACE_BEGIN template (blockIdx.x); auto num_tiles = static_cast(gridDim.x); auto tile_base = tile_idx * ITEMS_PER_TILE; - int items_in_tile = (cub::min)(keys_count - tile_base, int{ITEMS_PER_TILE}); + int items_in_tile = (::cuda::std::min)(static_cast(keys_count - tile_base), int{ITEMS_PER_TILE}); if (tile_idx < num_tiles - 1) { @@ -187,7 +194,7 @@ struct AgentBlockSort BlockLoadItems(storage.load_items).Load(items_in + tile_base, items_local); } - CTA_SYNC(); + __syncthreads(); } KeyT keys_local[ITEMS_PER_THREAD]; @@ -200,7 +207,7 @@ struct AgentBlockSort BlockLoadKeys(storage.load_keys).Load(keys_in + tile_base, keys_local); } - CTA_SYNC(); + __syncthreads(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) @@ -212,7 +219,7 @@ struct AgentBlockSort BlockMergeSortT(storage.block_merge).Sort(keys_local, items_local, compare_op); } - CTA_SYNC(); + __syncthreads(); if (ping) { @@ -227,7 +234,7 @@ struct AgentBlockSort _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { - CTA_SYNC(); + __syncthreads(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { @@ -252,7 +259,7 @@ struct AgentBlockSort _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { - CTA_SYNC(); + __syncthreads(); _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { @@ -335,10 +342,10 @@ struct AgentPartition // partition_idx / target_merged_tiles_number const OffsetT local_tile_idx = mask & partition_idx; - const OffsetT keys1_beg = (cub::min)(keys_count, start); - const OffsetT keys1_end = 
(cub::min)(keys_count, detail::safe_add_bound_to_max(start, size)); + const OffsetT keys1_beg = (::cuda::std::min)(keys_count, start); + const OffsetT keys1_end = (::cuda::std::min)(keys_count, detail::safe_add_bound_to_max(start, size)); const OffsetT keys2_beg = keys1_end; - const OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); + const OffsetT keys2_end = (::cuda::std::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); _CCCL_PDL_GRID_DEPENDENCY_SYNC(); @@ -349,7 +356,7 @@ struct AgentPartition } else { - const OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); + const OffsetT partition_at = (::cuda::std::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); OffsetT partition_diag = ping @@ -371,8 +378,6 @@ struct AgentPartition } }; -namespace detail -{ /** * \brief Concatenates up to ITEMS_PER_THREAD elements from input{1,2} into output array * @@ -418,7 +423,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void reg_to_shared(It output, T (&input)[ITEMS_PE output[idx] = input[item]; } } -} // namespace detail /// \brief The agent is responsible for merging N consecutive sorted arrays into N/2 sorted arrays. template = keys1_beg, because diag is the distance of the total merge path so far (keys1 + keys2) // diag+ITEMS_PER_TILE >= keys1_end, because diag+ITEMS_PER_TILE is the distance of the merge path for the next tile // and keys1_end is key1's component of that path - const OffsetT keys2_beg = (cub::min)(max_keys2, diag - keys1_beg); - OffsetT keys2_end = - (cub::min)(max_keys2, detail::safe_add_bound_to_max(diag, static_cast(ITEMS_PER_TILE)) - keys1_end); + const OffsetT keys2_beg = (::cuda::std::min)(max_keys2, diag - keys1_beg); + OffsetT keys2_end = (::cuda::std::min)( + max_keys2, detail::safe_add_bound_to_max(diag, static_cast(ITEMS_PER_TILE)) - keys1_end); // Check if it's the last tile in the tile group being merged if (mask == (mask & tile_idx)) { - keys1_end = (cub::min)(keys_count - start, size); - keys2_end = (cub::min)(max_keys2, size); + keys1_end = (::cuda::std::min)(keys_count - start, size); + keys2_end = (::cuda::std::min)(max_keys2, size); } // number of keys per tile @@ -547,15 +551,15 @@ struct AgentMerge KeyT keys_local[ITEMS_PER_THREAD]; if (ping) { - detail::gmem_to_reg( + gmem_to_reg( keys_local, keys_in_ping + start + keys1_beg, keys_in_ping + start + size + keys2_beg, num_keys1, num_keys2); } else { - detail::gmem_to_reg( + gmem_to_reg( keys_local, keys_in_pong + start + keys1_beg, keys_in_pong + start + size + keys2_beg, num_keys1, num_keys2); } - detail::reg_to_shared(&storage.keys_shared[0], keys_local); + reg_to_shared(&storage.keys_shared[0], keys_local); // preload items into registers already // @@ -565,7 +569,7 @@ struct AgentMerge { if (ping) { - detail::gmem_to_reg( + gmem_to_reg( items_local, items_in_ping + start + keys1_beg, items_in_ping + start + size + keys2_beg, @@ -574,7 +578,7 @@ struct AgentMerge } else { - detail::gmem_to_reg( + gmem_to_reg( items_local, items_in_pong + start + keys1_beg, items_in_pong + start + size + keys2_beg, @@ -583,7 +587,7 @@ struct AgentMerge } } - CTA_SYNC(); + __syncthreads(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // use binary search in shared memory @@ -591,7 +595,7 @@ struct AgentMerge // we can use int type here, because the number of // items in shared memory is limited // - const int diag0_local = (cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); + const int diag0_local = (::cuda::std::min)(num_keys1 + 
num_keys2, ITEMS_PER_THREAD * tid); const int keys1_beg_local = MergePath( &storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_local, compare_op); @@ -616,7 +620,7 @@ struct AgentMerge indices, compare_op); - CTA_SYNC(); + __syncthreads(); // write keys if (ping) @@ -650,11 +654,11 @@ struct AgentMerge _CCCL_IF_CONSTEXPR (!KEYS_ONLY) #endif // _CCCL_CUDACC_AT_LEAST(11, 8) { - CTA_SYNC(); + __syncthreads(); - detail::reg_to_shared(&storage.items_shared[0], items_local); + reg_to_shared(&storage.items_shared[0], items_local); - CTA_SYNC(); + __syncthreads(); // gather items from shared mem // @@ -664,7 +668,7 @@ struct AgentMerge items_local[item] = storage.items_shared[indices[item]]; } - CTA_SYNC(); + __syncthreads(); // write from reg to gmem // @@ -731,7 +735,7 @@ struct AgentMerge const OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE; const int tid = static_cast(threadIdx.x); const int items_in_tile = - static_cast((cub::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); + static_cast((::cuda::std::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); if (tile_idx < num_tiles - 1) { @@ -744,4 +748,45 @@ struct AgentMerge } }; +} // namespace merge_sort +} // namespace detail + +template +using AgentBlockSort CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::merge_sort::AgentBlockSort< + Policy, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + KeyT, + ValueT>; + +template +using AgentPartition CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::merge_sort::AgentPartition; + +template +using AgentMerge CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public interface " + "will be removed.") = + detail::merge_sort::AgentMerge; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_radix_sort_downsweep.cuh b/cub/cub/agent/agent_radix_sort_downsweep.cuh index 43562c9a2b5..cc6e5c18f11 100644 --- a/cub/cub/agent/agent_radix_sort_downsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_downsweep.cuh @@ -124,6 +124,11 @@ struct AgentRadixSortDownsweepPolicy : ScalingType * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace radix_sort +{ + /** * @brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in * device-wide radix sort downsweep . 
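// The pattern applied throughout these agent headers: the agent moves into a detail namespace and the old
// public name is kept only as a deprecated alias (via CCCL_DEPRECATED_BECAUSE), so downstream code that still
// names the agent compiles but is warned that it relies on an implementation detail. Below is a minimal
// stand-alone illustration of the same technique using the standard [[deprecated]] attribute; the names lib
// and Agent are placeholders, not CUB's.
namespace lib
{
namespace detail
{
template <typename T>
struct Agent
{
  T value;
};
} // namespace detail

template <typename T>
using Agent [[deprecated("This class is an implementation detail; use is discouraged.")]] = detail::Agent<T>;
} // namespace lib

int main()
{
  lib::detail::Agent<int> internal_use{7}; // internal spelling, no diagnostic
  lib::Agent<int> legacy_use{42}; // still compiles, flagged as deprecated by conforming compilers
  return internal_use.value + legacy_use.value;
}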
@@ -148,14 +153,14 @@ template + typename DecomposerT = identity_decomposer_t> struct AgentRadixSortDownsweep { //--------------------------------------------------------------------- // Type definitions and constants //--------------------------------------------------------------------- - using traits = detail::radix::traits_t; + using traits = radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; @@ -182,8 +187,7 @@ struct AgentRadixSortDownsweep using ValuesItr = CacheModifiedInputIterator; // Radix ranking type to use - using BlockRadixRankT = - cub::detail::block_radix_rank_t; + using BlockRadixRankT = block_radix_rank_t; // Digit extractor type using fundamental_digit_extractor_t = BFEDigitExtractor; @@ -277,7 +281,7 @@ struct AgentRadixSortDownsweep temp_storage.keys_and_offsets.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -305,7 +309,7 @@ struct AgentRadixSortDownsweep int (&ranks)[ITEMS_PER_THREAD], OffsetT valid_items) { - CTA_SYNC(); + __syncthreads(); ValueExchangeT& exchange_values = temp_storage.exchange_values.Alias(); @@ -315,7 +319,7 @@ struct AgentRadixSortDownsweep exchange_values[ranks[ITEM]] = values[ITEM]; } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -342,7 +346,7 @@ struct AgentRadixSortDownsweep { BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); - CTA_SYNC(); + __syncthreads(); } /** @@ -362,7 +366,7 @@ struct AgentRadixSortDownsweep BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, valid_items, oob_item); - CTA_SYNC(); + __syncthreads(); } /** @@ -409,7 +413,7 @@ struct AgentRadixSortDownsweep { BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values); - CTA_SYNC(); + __syncthreads(); } /** @@ -428,7 +432,7 @@ struct AgentRadixSortDownsweep BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values, valid_items); - CTA_SYNC(); + __syncthreads(); } /** @@ -474,7 +478,7 @@ struct AgentRadixSortDownsweep { ValueT values[ITEMS_PER_THREAD]; - CTA_SYNC(); + __syncthreads(); LoadValues(values, block_offset, valid_items, Int2Type(), Int2Type()); @@ -520,7 +524,7 @@ struct AgentRadixSortDownsweep int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; BlockRadixRankT(temp_storage.radix_rank).RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix); - CTA_SYNC(); + __syncthreads(); // Share exclusive digit prefix #pragma unroll @@ -534,7 +538,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Get inclusive digit prefix int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; @@ -562,7 +566,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Update global scatter base offsets for each digit #pragma unroll @@ -577,7 +581,7 @@ struct AgentRadixSortDownsweep } } - CTA_SYNC(); + __syncthreads(); // Scatter keys ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); @@ -602,7 +606,7 @@ struct AgentRadixSortDownsweep T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, items); - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items); block_offset += TILE_ITEMS; @@ -616,7 +620,7 @@ struct AgentRadixSortDownsweep T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, 
items, valid_items); - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); } } @@ -670,7 +674,7 @@ struct AgentRadixSortDownsweep } } - short_circuit = CTA_SYNC_AND(short_circuit); + short_circuit = __syncthreads_and(short_circuit); } /** @@ -719,7 +723,7 @@ struct AgentRadixSortDownsweep } } - short_circuit = CTA_SYNC_AND(short_circuit); + short_circuit = __syncthreads_and(short_circuit); } /** @@ -744,7 +748,7 @@ struct AgentRadixSortDownsweep ProcessTile(block_offset); block_offset += TILE_ITEMS; - CTA_SYNC(); + __syncthreads(); } // Clean up last partial tile with guarded-I/O @@ -756,4 +760,18 @@ struct AgentRadixSortDownsweep } }; +} // namespace radix_sort +} // namespace detail + +template +using AgentRadixSortDownsweep CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public " + "interface will be removed.") = detail::radix_sort:: + AgentRadixSortDownsweep; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_radix_sort_histogram.cuh b/cub/cub/agent/agent_radix_sort_histogram.cuh index 2785f732450..29580897764 100644 --- a/cub/cub/agent/agent_radix_sort_histogram.cuh +++ b/cub/cub/agent/agent_radix_sort_histogram.cuh @@ -50,6 +50,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN template @@ -79,11 +81,16 @@ struct AgentRadixSortExclusiveSumPolicy }; }; +namespace detail +{ +namespace radix_sort +{ + template + typename DecomposerT = identity_decomposer_t> struct AgentRadixSortHistogram { // constants @@ -98,7 +105,7 @@ struct AgentRadixSortHistogram NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS, }; - using traits = detail::radix::traits_t; + using traits = radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; @@ -172,7 +179,7 @@ struct AgentRadixSortHistogram } } } - CTA_SYNC(); + __syncthreads(); } _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) @@ -199,7 +206,7 @@ struct AgentRadixSortHistogram _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateSharedHistograms(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) { - int part = LaneId() % NUM_PARTS; + int part = ::cuda::ptx::get_sreg_laneid() % NUM_PARTS; #pragma unroll for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { @@ -247,7 +254,7 @@ struct AgentRadixSortHistogram { // Reset the counters. Init(); - CTA_SYNC(); + __syncthreads(); // Process the tiles. OffsetT portion_offset = portion * MAX_PORTION_SIZE; @@ -259,11 +266,11 @@ struct AgentRadixSortHistogram LoadTileKeys(tile_offset, keys); AccumulateSharedHistograms(tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); // Accumulate the result in global memory. 
AccumulateGlobalHistograms(); - CTA_SYNC(); + __syncthreads(); } } @@ -273,4 +280,17 @@ struct AgentRadixSortHistogram } }; +} // namespace radix_sort +} // namespace detail + +template +using AgentRadixSortHistogram CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::radix_sort::AgentRadixSortHistogram; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh index a78ee66c7b2..331012d36b9 100644 --- a/cub/cub/agent/agent_radix_sort_onesweep.cuh +++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh @@ -49,6 +49,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -96,13 +97,18 @@ struct AgentRadixSortOnesweepPolicy : ScalingType static constexpr RadixSortStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; +namespace detail +{ +namespace radix_sort +{ + template + typename DecomposerT = identity_decomposer_t> struct AgentRadixSortOnesweep { // constants @@ -126,7 +132,7 @@ struct AgentRadixSortOnesweep LOOKBACK_VALUE_MASK = ~LOOKBACK_KIND_MASK, }; - using traits = detail::radix::traits_t; + using traits = radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; @@ -279,7 +285,7 @@ struct AgentRadixSortOnesweep } while (value_j == 0); inc_sum += value_j & LOOKBACK_VALUE_MASK; - want_mask = WARP_BALLOT((value_j & LOOKBACK_GLOBAL_MASK) == 0, want_mask); + want_mask = __ballot_sync(want_mask, (value_j & LOOKBACK_GLOBAL_MASK) == 0); if (value_j & LOOKBACK_GLOBAL_MASK) { break; @@ -349,7 +355,7 @@ struct AgentRadixSortOnesweep short_circuit = short_circuit || bins[u] == TILE_ITEMS; } } - short_circuit = CTA_SYNC_OR(short_circuit); + short_circuit = __syncthreads_or(short_circuit); if (!short_circuit) { return; @@ -377,7 +383,7 @@ struct AgentRadixSortOnesweep LoadBinsToOffsetsGlobal(offsets); LookbackGlobal(bins); UpdateBinsGlobal(bins, offsets); - CTA_SYNC(); + __syncthreads(); // scatter the keys OffsetT global_offset = s.global_offsets[common_bin]; @@ -483,7 +489,7 @@ struct AgentRadixSortOnesweep { d_keys_out[global_idx] = Twiddle::Out(key, decomposer); } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); } } @@ -501,7 +507,7 @@ struct AgentRadixSortOnesweep { d_values_out[global_idx] = value; } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); } } @@ -527,7 +533,7 @@ struct AgentRadixSortOnesweep { num_writes -= int(global_idx + 1) % ALIGN; } - num_writes = SHFL_IDX_SYNC(num_writes, last_lane, WARP_MASK); + num_writes = __shfl_sync(WARP_MASK, num_writes, last_lane); if (lane < num_writes) { ThreadStore(&d_keys_out[global_idx], key_out); @@ -600,10 +606,10 @@ struct AgentRadixSortOnesweep LoadValues(block_idx * TILE_ITEMS, values); // scatter values - CTA_SYNC(); + __syncthreads(); ScatterValuesShared(values, ranks); - CTA_SYNC(); + __syncthreads(); ScatterValuesGlobal(digits); } @@ -625,7 +631,7 @@ struct AgentRadixSortOnesweep .RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix, CountsCallback(*this, bins, keys)); // scatter keys in shared memory - CTA_SYNC(); + __syncthreads(); ScatterKeysShared(keys, ranks); // compute global offsets @@ -634,7 +640,7 @@ struct AgentRadixSortOnesweep UpdateBinsGlobal(bins, exclusive_digit_prefix); // scatter keys in global memory - CTA_SYNC(); + __syncthreads(); ScatterKeysGlobal(); // scatter values if necessary @@ -669,7 +675,7 @@ struct AgentRadixSortOnesweep , current_bit(current_bit) 
, num_bits(num_bits) , warp(threadIdx.x / WARP_THREADS) - , lane(LaneId()) + , lane(::cuda::ptx::get_sreg_laneid()) , decomposer(decomposer) { // initialization @@ -677,10 +683,24 @@ struct AgentRadixSortOnesweep { s.block_idx = atomicAdd(d_ctrs, 1); } - CTA_SYNC(); + __syncthreads(); block_idx = s.block_idx; full_block = (block_idx + 1) * TILE_ITEMS <= num_items; } }; +} // namespace radix_sort +} // namespace detail + +template +using AgentRadixSortOnesweep CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = detail::radix_sort:: + AgentRadixSortOnesweep; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh index e91e32c5bd3..cc0c10464f3 100644 --- a/cub/cub/agent/agent_radix_sort_upsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh @@ -52,6 +52,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /****************************************************************************** @@ -98,6 +100,11 @@ struct AgentRadixSortUpsweepPolicy : ScalingType * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace radix_sort +{ + /** * @brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for * participating in device-wide radix sort upsweep . @@ -108,19 +115,19 @@ struct AgentRadixSortUpsweepPolicy : ScalingType * @tparam KeyT * KeyT type * - * @tparam DecomposerT = detail::identity_decomposer_t + * @tparam DecomposerT = identity_decomposer_t * Signed integer type for global offsets */ template + typename DecomposerT = identity_decomposer_t> struct AgentRadixSortUpsweep { //--------------------------------------------------------------------- // Type definitions and constants //--------------------------------------------------------------------- - using traits = detail::radix::traits_t; + using traits = radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; @@ -298,7 +305,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void UnpackDigitCounts() { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) @@ -331,7 +338,7 @@ struct AgentRadixSortUpsweep LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); // Prevent hoisting - CTA_SYNC(); + __syncthreads(); // Bucket tile of keys Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); @@ -385,12 +392,12 @@ struct AgentRadixSortUpsweep block_offset += TILE_ITEMS; } - CTA_SYNC(); + __syncthreads(); // Aggregate back into local_count registers to prevent overflow UnpackDigitCounts(); - CTA_SYNC(); + __syncthreads(); // Reset composite counters in lanes ResetDigitCounters(); @@ -406,7 +413,7 @@ struct AgentRadixSortUpsweep // Process partial tile if necessary ProcessPartialTile(block_offset, block_end); - CTA_SYNC(); + __syncthreads(); // Aggregate back into local_count registers UnpackDigitCounts(); @@ -419,7 +426,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT* counters, int bin_stride = 1, int bin_offset = 0) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); // Place 
unpacked digit counters in shared memory #pragma unroll @@ -440,7 +447,7 @@ struct AgentRadixSortUpsweep } } - CTA_SYNC(); + __syncthreads(); // Rake-reduce bin_count reductions @@ -499,7 +506,7 @@ struct AgentRadixSortUpsweep _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); + unsigned int warp_tid = ::cuda::ptx::get_sreg_laneid(); // Place unpacked digit counters in shared memory #pragma unroll @@ -520,7 +527,7 @@ struct AgentRadixSortUpsweep } } - CTA_SYNC(); + __syncthreads(); // Rake-reduce bin_count reductions #pragma unroll @@ -542,4 +549,15 @@ struct AgentRadixSortUpsweep } }; +} // namespace radix_sort +} // namespace detail + +template +using AgentRadixSortUpsweep CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::radix_sort::AgentRadixSortUpsweep; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index d5e3514f369..35779d0e8a6 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -95,6 +95,11 @@ struct AgentReducePolicy : ScalingType * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace reduce +{ + /** * @brief AgentReduce implements a stateful abstraction of CUDA thread blocks * for participating in device-wide reduction . @@ -136,7 +141,7 @@ struct AgentReduce //--------------------------------------------------------------------- /// The input value type - using InputT = cub::detail::value_t; + using InputT = value_t; /// Vector type of InputT for data movement using VectorT = typename CubVector::Type; @@ -249,8 +254,7 @@ struct AgentReduce AccumT items[ITEMS_PER_THREAD]; // Load items in striped fashion - cub::detail::load_transform_direct_striped( - threadIdx.x, d_wrapped_in + block_offset, items, transform_op); + load_transform_direct_striped(threadIdx.x, d_wrapped_in + block_offset, items, transform_op); // Reduce items within each thread stripe thread_aggregate = (IS_FIRST_TILE) ? 
cub::ThreadReduce(items, reduction_op) @@ -445,4 +449,18 @@ private: } }; +} // namespace reduce +} // namespace detail + +template +using AgentReduce CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public interface " + "will be removed.") = detail::reduce:: + AgentReduce; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index 735993723d8..ac0d9045ab9 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -116,6 +116,11 @@ struct AgentReduceByKeyPolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace reduce +{ + /** * @brief AgentReduceByKey implements a stateful abstraction of CUDA thread * blocks for participating in device-wide reduce-value-by-key @@ -167,13 +172,13 @@ struct AgentReduceByKey //--------------------------------------------------------------------- // The input keys type - using KeyInputT = cub::detail::value_t; + using KeyInputT = value_t; // The output keys type - using KeyOutputT = cub::detail::non_void_value_t; + using KeyOutputT = non_void_value_t; // The input values type - using ValueInputT = cub::detail::value_t; + using ValueInputT = value_t; // Tuple type for scanning (pairs accumulated segment-value with // segment-index) @@ -426,7 +431,7 @@ struct AgentReduceByKey OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) { - CTA_SYNC(); + __syncthreads(); // Compact and scatter pairs #pragma unroll @@ -438,7 +443,7 @@ struct AgentReduceByKey } } - CTA_SYNC(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) { @@ -539,7 +544,7 @@ struct AgentReduceByKey tile_predecessor = (tile_idx == 0) ? keys[0] : d_keys_in[tile_offset - 1]; } - CTA_SYNC(); + __syncthreads(); // Load values if (IS_LAST_TILE) @@ -551,7 +556,7 @@ struct AgentReduceByKey BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); // Initialize head-flags and shuffle up the previous keys if (IS_LAST_TILE) @@ -694,4 +699,31 @@ struct AgentReduceByKey } }; +} // namespace reduce +} // namespace detail + +template +using AgentReduceByKey CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::reduce::AgentReduceByKey< + AgentReduceByKeyPolicyT, + KeysInputIteratorT, + UniqueOutputIteratorT, + ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT, + AccumT>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 2495d2f5f7a..f8898fa4281 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -133,6 +134,11 @@ struct AgentRlePolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace rle +{ + /** * @brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide * run-length-encode @@ -465,7 +471,7 @@ struct AgentRle { // Perform warpscans unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); LengthOffsetPair identity; identity.key = 0; @@ -501,7 +507,7 @@ struct AgentRle temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[warp_id] = thread_inclusive; } - CTA_SYNC(); + __syncthreads(); // Accumulate total selected and the warp-wide prefix @@ -531,7 +537,7 @@ struct AgentRle // Ensure all threads have read warp aggregates before temp_storage is repurposed in the // subsequent scatter stage - CTA_SYNC(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -551,7 +557,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); // Locally compact items within the warp (first warp) if (warp_id == 0) @@ -564,7 +570,7 @@ struct AgentRle #pragma unroll for (int SLICE = 1; SLICE < WARPS; ++SLICE) { - CTA_SYNC(); + __syncthreads(); if (warp_id == SLICE) { @@ -608,7 +614,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); + int lane_id = ::cuda::ptx::get_sreg_laneid(); // Unzip OffsetT run_offsets[ITEMS_PER_THREAD]; @@ -624,7 +630,7 @@ struct AgentRle WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]) .ScatterToStriped(run_offsets, thread_num_runs_exclusive_in_warp); - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]) .ScatterToStriped(run_lengths, thread_num_runs_exclusive_in_warp); @@ -762,7 +768,7 @@ struct AgentRle if (SYNC_AFTER_LOAD) { - CTA_SYNC(); + __syncthreads(); } // Set flags @@ -848,7 +854,7 @@ struct AgentRle if (SYNC_AFTER_LOAD) { - CTA_SYNC(); + __syncthreads(); } // Set flags @@ -878,7 +884,7 @@ struct AgentRle } } - CTA_SYNC(); + __syncthreads(); LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; @@ -989,4 +995,17 @@ struct AgentRle } }; +} // namespace rle +} // namespace detail + +template +using AgentRle CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public interface " + "will be removed.") = detail::rle:: + AgentRle; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_scan.cuh b/cub/cub/agent/agent_scan.cuh index 7021531d0cc..c3cc02b69a1 100644 --- a/cub/cub/agent/agent_scan.cuh +++ b/cub/cub/agent/agent_scan.cuh @@ -112,6 +112,11 @@ struct AgentScanPolicy : ScalingType * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace scan +{ + /** * @brief AgentScan implements a stateful abstraction of CUDA thread blocks for * participating in device-wide prefix scan. 
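The hunks above and below repeat one pattern per agent header: the agent type moves into a detail::<algorithm> namespace, and the old public name survives only as a deprecated alias. A minimal sketch of that pattern follows, with an abbreviated template parameter list purely for illustration (the real aliases in this diff forward every policy/iterator/offset/accumulator parameter), assuming CCCL's CCCL_DEPRECATED_BECAUSE macro:

// Sketch only: parameter lists are abbreviated relative to the actual hunks.
namespace detail
{
namespace scan
{
template <typename PolicyT, typename InputIteratorT, typename OutputIteratorT>
struct AgentScan
{
  // implementation now lives under the detail::scan namespace
};
} // namespace scan
} // namespace detail

// The old public name is kept as a deprecated alias so downstream code still
// builds, but with a warning steering users away from the internal type.
template <typename PolicyT, typename InputIteratorT, typename OutputIteratorT>
using AgentScan CCCL_DEPRECATED_BECAUSE(
  "This class is considered an implementation detail and the public interface will be removed.") =
  detail::scan::AgentScan<PolicyT, InputIteratorT, OutputIteratorT>;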
@@ -376,7 +381,7 @@ struct AgentScan BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } - CTA_SYNC(); + __syncthreads(); // Perform tile scan if (tile_idx == 0) @@ -397,7 +402,7 @@ struct AgentScan ScanTile(items, scan_op, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); // Store items if (IS_LAST_TILE) @@ -482,7 +487,7 @@ struct AgentScan BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } - CTA_SYNC(); + __syncthreads(); // Block scan if (IS_FIRST_TILE) @@ -496,7 +501,7 @@ struct AgentScan ScanTile(items, scan_op, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); // Store items if (IS_LAST_TILE) @@ -582,4 +587,19 @@ struct AgentScan } }; +} // namespace scan +} // namespace detail + +template +using AgentScan CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public interface " + "will be removed.") = detail::scan:: + AgentScan; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_scan_by_key.cuh b/cub/cub/agent/agent_scan_by_key.cuh index 6e79ca18d8c..722a44ac074 100644 --- a/cub/cub/agent/agent_scan_by_key.cuh +++ b/cub/cub/agent/agent_scan_by_key.cuh @@ -94,6 +94,11 @@ struct AgentScanByKeyPolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace scan_by_key +{ + /** * @brief AgentScanByKey implements a stateful abstraction of CUDA thread * blocks for participating in device-wide prefix scan by key. @@ -140,10 +145,10 @@ struct AgentScanByKey // Types and constants //--------------------------------------------------------------------- - using KeyT = cub::detail::value_t; - using InputT = cub::detail::value_t; + using KeyT = value_t; + using InputT = value_t; using FlagValuePairT = KeyValuePair; - using ReduceBySegmentOpT = detail::ScanBySegmentOp; + using ReduceBySegmentOpT = ScanBySegmentOp; using ScanTileStateT = ReduceByKeyScanTileState; @@ -333,7 +338,7 @@ struct AgentScanByKey BlockLoadKeysT(storage.load_keys).Load(d_keys_in + tile_base, keys); } - CTA_SYNC(); + __syncthreads(); if (IS_LAST_TILE) { @@ -347,7 +352,7 @@ struct AgentScanByKey BlockLoadValuesT(storage.load_values).Load(d_values_in + tile_base, values); } - CTA_SYNC(); + __syncthreads(); // first tile if (tile_idx == 0) @@ -386,7 +391,7 @@ struct AgentScanByKey ScanTile(scan_items, tile_aggregate, prefix_op, Int2Type()); } - CTA_SYNC(); + __syncthreads(); UnzipValues(values, scan_items); @@ -460,4 +465,29 @@ struct AgentScanByKey } }; +} // namespace scan_by_key +} // namespace detail + +template +using AgentScanByKey CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::scan_by_key::AgentScanByKey< + AgentScanByKeyPolicyT, + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOp, + ScanOpT, + InitValueT, + OffsetT, + AccumT>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_segment_fixup.cuh b/cub/cub/agent/agent_segment_fixup.cuh index 1cf5eff5008..caabf774ba8 100644 --- a/cub/cub/agent/agent_segment_fixup.cuh +++ b/cub/cub/agent/agent_segment_fixup.cuh @@ -110,6 +110,11 @@ struct AgentSegmentFixupPolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace segment_fixup +{ + /** * @brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for * participating in device-wide reduce-value-by-key @@ -145,7 +150,7 @@ struct 
AgentSegmentFixup //--------------------------------------------------------------------- // Data type of key-value input iterator - using KeyValuePairT = cub::detail::value_t; + using KeyValuePairT = value_t; // Value type using ValueT = typename KeyValuePairT::Value; @@ -376,7 +381,7 @@ struct AgentSegmentFixup BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); } - CTA_SYNC(); + __syncthreads(); KeyValuePairT tile_aggregate; if (tile_idx == 0) @@ -468,4 +473,23 @@ struct AgentSegmentFixup } }; +} // namespace segment_fixup +} // namespace detail + +template +using AgentSegmentFixup CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::segment_fixup::AgentSegmentFixup< + AgentSegmentFixupPolicyT, + PairsInputIteratorT, + AggregatesOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_segmented_radix_sort.cuh b/cub/cub/agent/agent_segmented_radix_sort.cuh index fe687fa9f51..e8921aaf045 100644 --- a/cub/cub/agent/agent_segmented_radix_sort.cuh +++ b/cub/cub/agent/agent_segmented_radix_sort.cuh @@ -45,6 +45,11 @@ CUB_NAMESPACE_BEGIN +namespace detail +{ +namespace radix_sort +{ + /** * This agent will be implementing the `DeviceSegmentedRadixSort` when the * https://github.com/NVIDIA/cub/issues/383 is addressed. @@ -69,7 +74,7 @@ template + typename DecomposerT = identity_decomposer_t> struct AgentSegmentedRadixSort { OffsetT num_items; @@ -80,7 +85,7 @@ struct AgentSegmentedRadixSort static constexpr int RADIX_DIGITS = 1 << RADIX_BITS; static constexpr int KEYS_ONLY = std::is_same::value; - using traits = detail::radix::traits_t; + using traits = radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; // Huge segment handlers @@ -154,13 +159,13 @@ struct AgentSegmentedRadixSort { BlockValueLoadT(temp_storage.values_load).Load(d_values_in, thread_values, num_items); - CTA_SYNC(); + __syncthreads(); } { BlockKeyLoadT(temp_storage.keys_load).Load(d_keys_in, thread_keys, num_items, oob_default); - CTA_SYNC(); + __syncthreads(); } BlockRadixSortT(temp_storage.sort) @@ -187,13 +192,13 @@ struct AgentSegmentedRadixSort BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits, decomposer); upsweep.ProcessRegion(OffsetT{}, num_items); - CTA_SYNC(); + __syncthreads(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); - CTA_SYNC(); + __syncthreads(); if (IS_DESCENDING) { @@ -209,7 +214,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -243,7 +248,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -257,7 +262,7 @@ struct AgentSegmentedRadixSort } } - CTA_SYNC(); + __syncthreads(); // Downsweep BlockDownsweepT downsweep( @@ -275,4 +280,18 @@ struct AgentSegmentedRadixSort } }; +} // namespace radix_sort +} // namespace detail + +template +using AgentSegmentedRadixSort CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::radix_sort::AgentSegmentedRadixSort; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 4f16992b276..37e7b838adf 100644 
--- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -123,6 +123,9 @@ struct AgentSelectIfPolicy namespace detail { +namespace select +{ + template struct partition_distinct_output_t { @@ -132,7 +135,6 @@ struct partition_distinct_output_t selected_iterator_t selected_it; rejected_iterator_t rejected_it; }; -} // namespace detail /** * @brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in @@ -210,13 +212,13 @@ struct AgentSelectIf // If we need to enforce memory order for in-place stream compaction, wrap the default decoupled look-back tile // state in a helper class that enforces memory order on reads and writes - using MemoryOrderedTileStateT = detail::tile_state_with_memory_order; + using MemoryOrderedTileStateT = tile_state_with_memory_order; // The input value type - using InputT = cub::detail::value_t; + using InputT = value_t; // The flag value type - using FlagT = cub::detail::value_t; + using FlagT = value_t; // Constants enum @@ -408,7 +410,7 @@ struct AgentSelectIf OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { - CTA_SYNC(); + __syncthreads(); FlagT flags[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -450,7 +452,7 @@ struct AgentSelectIf OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { - CTA_SYNC(); + __syncthreads(); FlagT flags[ITEMS_PER_THREAD]; @@ -486,7 +488,7 @@ struct AgentSelectIf { if (IS_FIRST_TILE && streaming_context.is_first_partition()) { - CTA_SYNC(); + __syncthreads(); // Set head selection_flags. First tile sets the first flag for the first item BlockDiscontinuityT(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); @@ -499,7 +501,7 @@ struct AgentSelectIf tile_predecessor = d_in[tile_offset + streaming_context.input_offset() - 1]; } - CTA_SYNC(); + __syncthreads(); BlockDiscontinuityT(temp_storage.scan_storage.discontinuity) .FlagHeads(selection_flags, items, inequality_op, tile_predecessor); @@ -571,7 +573,7 @@ struct AgentSelectIf int num_tile_selections, OffsetT num_selections_prefix) { - CTA_SYNC(); + __syncthreads(); // Compact and scatter items #pragma unroll @@ -584,7 +586,7 @@ struct AgentSelectIf } } - CTA_SYNC(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { @@ -667,7 +669,7 @@ struct AgentSelectIf OffsetT num_selections, Int2Type /*is_keep_rejects*/) { - CTA_SYNC(); + __syncthreads(); int tile_num_rejections = num_tile_items - num_tile_selections; @@ -685,7 +687,7 @@ struct AgentSelectIf } // Ensure all threads finished scattering to shared memory - CTA_SYNC(); + __syncthreads(); // Gather items from shared memory and scatter to global ScatterPartitionsToGlobal( @@ -702,7 +704,7 @@ struct AgentSelectIf int tile_num_rejections, OffsetT num_selections_prefix, OffsetT num_rejected_prefix, - detail::partition_distinct_output_t partitioned_out_wrapper) + partition_distinct_output_t partitioned_out_wrapper) { auto selected_out_it = partitioned_out_wrapper.selected_it + streaming_context.num_previously_selected(); auto rejected_out_it = partitioned_out_wrapper.rejected_it + streaming_context.num_previously_rejected(); @@ -814,7 +816,7 @@ struct AgentSelectIf // Ensure temporary storage used during block load can be reused // Also, in case of in-place stream compaction, this is needed to order the loads of // *all threads of this thread block* before the st.release of the thread writing this thread block's tile state - CTA_SYNC(); + 
__syncthreads(); // Exclusive scan of selection_flags OffsetT num_tile_selections; @@ -894,7 +896,7 @@ struct AgentSelectIf // Ensure temporary storage used during block load can be reused // Also, in case of in-place stream compaction, this is needed to order the loads of // *all threads of this thread block* before the st.release of the thread writing this thread block's tile state - CTA_SYNC(); + __syncthreads(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op( @@ -1014,4 +1016,36 @@ struct AgentSelectIf } }; +} // namespace select +} // namespace detail + +template +using partition_distinct_output_t CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the " + "public interface will be removed.") = + detail::select::partition_distinct_output_t; + +template +using AgentSelectIf CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::select::AgentSelectIf< + AgentSelectIfPolicyT, + InputIteratorT, + FlagsInputIteratorT, + OutputIteratorWrapperT, + SelectOpT, + EqualityOpT, + OffsetT, + StreamingContextT, + KeepRejects, + MayAlias>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh index 41c40bee28e..80d571d58db 100644 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ b/cub/cub/agent/agent_spmv_orig.cuh @@ -52,6 +52,8 @@ #include #include +#include +#include #include #include @@ -102,7 +104,7 @@ template -struct AgentSpmvPolicy +struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmvPolicy { enum { @@ -148,7 +150,12 @@ struct AgentSpmvPolicy * Signed integer type for sequence offsets */ template -struct SpmvParams +struct +// with NVHPC, we get a deprecation warning in the implementation of cudaLaunchKernelEx, which we cannot suppress :/ +#if !_CCCL_COMPILER(NVHPC) + CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") +#endif + SpmvParams { /// Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix /// A. @@ -211,7 +218,7 @@ template -struct AgentSpmv +struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv { //--------------------------------------------------------------------- // Types and constants @@ -308,7 +315,9 @@ struct AgentSpmv /// Reference to temp_storage _TempStorage& temp_storage; + _CCCL_SUPPRESS_DEPRECATED_PUSH SpmvParams& spmv_params; + _CCCL_SUPPRESS_DEPRECATED_POP /// Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements /// of matrix A. 
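For the SpMV agent the direction is different: instead of moving into detail, the types are deprecated outright in favor of cuSPARSE, and the spots where CUB's own headers must still name them are bracketed with suppression macros. A minimal sketch, assuming the CCCL_DEPRECATED_BECAUSE / _CCCL_SUPPRESS_DEPRECATED_PUSH / _CCCL_SUPPRESS_DEPRECATED_POP macros used in this diff (template parameters and members omitted for brevity):

struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") SpmvParams
{
  // pointers describing the CSR matrix and the x/y vectors
};

struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
{
  // Naming the deprecated parameter bundle inside CUB itself would emit the
  // warning while building CUB, so the member (and, below, the constructor)
  // is wrapped in push/pop suppression:
  _CCCL_SUPPRESS_DEPRECATED_PUSH
  SpmvParams& spmv_params;
  _CCCL_SUPPRESS_DEPRECATED_POP
};

As the diff notes, the SpmvParams attribute is additionally skipped under NVHPC, where a resulting deprecation warning inside cudaLaunchKernelEx cannot be suppressed.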
@@ -341,6 +350,7 @@ struct AgentSpmv * @param spmv_params * SpMV input parameter bundle */ + _CCCL_SUPPRESS_DEPRECATED_PUSH _CCCL_DEVICE _CCCL_FORCEINLINE AgentSpmv(TempStorage& temp_storage, SpmvParams& spmv_params) : temp_storage(temp_storage.Alias()) , spmv_params(spmv_params) @@ -350,6 +360,7 @@ struct AgentSpmv , wd_vector_x(spmv_params.d_vector_x) , wd_vector_y(spmv_params.d_vector_y) {} + _CCCL_SUPPRESS_DEPRECATED_POP /** * @brief Consume a merge tile, specialized for direct-load of nonzeros @@ -367,12 +378,12 @@ struct AgentSpmv // Gather the row end-offsets for the merge tile into shared memory for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { - const OffsetT offset = - (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); + const OffsetT offset = (::cuda::std::min)( + static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } - CTA_SYNC(); + __syncthreads(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); @@ -386,7 +397,7 @@ struct AgentSpmv tile_num_nonzeros, thread_start_coord); - CTA_SYNC(); // Perf-sync + __syncthreads(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; @@ -425,7 +436,7 @@ struct AgentSpmv } } - CTA_SYNC(); + __syncthreads(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; @@ -548,12 +559,12 @@ struct AgentSpmv #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { - const OffsetT offset = - (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); + const OffsetT offset = (::cuda::std::min)( + static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } - CTA_SYNC(); + __syncthreads(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); @@ -567,7 +578,7 @@ struct AgentSpmv tile_num_nonzeros, thread_start_coord); - CTA_SYNC(); // Perf-sync + __syncthreads(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; @@ -600,7 +611,7 @@ struct AgentSpmv scan_segment[ITEM].key = thread_current_coord.x; } - CTA_SYNC(); + __syncthreads(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; @@ -620,7 +631,7 @@ struct AgentSpmv if (tile_num_rows > 0) { - CTA_SYNC(); + __syncthreads(); // Scan downsweep and scatter ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; @@ -647,7 +658,7 @@ struct AgentSpmv } } - CTA_SYNC(); + __syncthreads(); #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) @@ -709,7 +720,7 @@ struct AgentSpmv } } - CTA_SYNC(); + __syncthreads(); CoordinateT tile_start_coord = temp_storage.tile_coords[0]; CoordinateT tile_end_coord = temp_storage.tile_coords[1]; diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh index f07b2173cdc..b10f1cda3ea 100644 --- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh @@ -77,6 +77,11 @@ struct AgentSmallAndMediumSegmentedSortPolicy static constexpr int SEGMENTS_PER_SMALL_BLOCK = BLOCK_THREADS / SmallPolicyT::WARP_THREADS; }; +namespace 
detail +{ +namespace sub_warp_merge_sort +{ + /** * @brief AgentSubWarpSort implements a sub-warp merge sort. * @@ -168,8 +173,8 @@ class AgentSubWarpSort // LOWEST -> -nan = 11...11b -> TwiddleIn -> 0 = 00...00b // Segmented sort doesn't support custom types at the moment. - bit_ordered_type default_key_bits = IS_DESCENDING ? traits::min_raw_binary_key(detail::identity_decomposer_t{}) - : traits::max_raw_binary_key(detail::identity_decomposer_t{}); + bit_ordered_type default_key_bits = IS_DESCENDING ? traits::min_raw_binary_key(identity_decomposer_t{}) + : traits::max_raw_binary_key(identity_decomposer_t{}); return reinterpret_cast(default_key_bits); } @@ -233,23 +238,23 @@ public: KeyT oob_default = AgentSubWarpSort::get_oob_default(Int2Type::value>{}); WarpLoadKeysT(storage.load_keys).Load(keys_input, keys, segment_size, oob_default); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); if (!KEYS_ONLY) { WarpLoadItemsT(storage.load_items).Load(values_input, values, segment_size); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); } warp_merge_sort.Sort(keys, values, BinaryOpT{}, segment_size, oob_default); - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); WarpStoreKeysT(storage.store_keys).Store(keys_output, keys, segment_size); if (!KEYS_ONLY) { - WARP_SYNC(warp_merge_sort.get_member_mask()); + __syncwarp(warp_merge_sort.get_member_mask()); WarpStoreItemsT(storage.store_items).Store(values_output, values, segment_size); } } @@ -331,4 +336,12 @@ private: } }; +} // namespace sub_warp_merge_sort +} // namespace detail + +template +using AgentSubWarpSort CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::sub_warp_merge_sort::AgentSubWarpSort; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh index eec24057163..047861254ac 100644 --- a/cub/cub/agent/agent_three_way_partition.cuh +++ b/cub/cub/agent/agent_three_way_partition.cuh @@ -56,6 +56,26 @@ CUB_NAMESPACE_BEGIN * Tuning policy types ******************************************************************************/ +template > +struct AgentThreeWayPartitionPolicy +{ + static constexpr int BLOCK_THREADS = _BLOCK_THREADS; + static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + + struct detail + { + using delay_constructor_t = DelayConstructorT; + }; +}; + namespace detail { @@ -135,30 +155,6 @@ struct accumulator_pack_t : accumulator_pack_base_t } }; -} // namespace three_way_partition - -} // namespace detail - -template > -struct AgentThreeWayPartitionPolicy -{ - static constexpr int BLOCK_THREADS = _BLOCK_THREADS; - static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; - - struct detail - { - using delay_constructor_t = DelayConstructorT; - }; -}; - /** * \brief Implements a device-wide three-way partitioning * @@ -184,9 +180,9 @@ struct AgentThreeWayPartition 
//--------------------------------------------------------------------- // The input value type - using InputT = cub::detail::value_t; + using InputT = value_t; - using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; + using AccumPackHelperT = accumulator_pack_t; using AccumPackT = typename AccumPackHelperT::pack_t; // Tile status descriptor interface type @@ -313,7 +309,7 @@ struct AgentThreeWayPartition AccumPackT num_tile_selected_prefix, OffsetT num_rejected_prefix) { - CTA_SYNC(); + __syncthreads(); const OffsetT num_first_selections_prefix = AccumPackHelperT::first(num_tile_selected_prefix); const OffsetT num_second_selections_prefix = AccumPackHelperT::second(num_tile_selected_prefix); @@ -353,7 +349,7 @@ struct AgentThreeWayPartition } } - CTA_SYNC(); + __syncthreads(); // Gather items from shared memory and scatter to global auto first_base = @@ -421,7 +417,7 @@ struct AgentThreeWayPartition // Initialize selection_flags Initialize(num_tile_items, items, items_selection_flags); - CTA_SYNC(); + __syncthreads(); // Exclusive scan of selection_flags BlockScanT(temp_storage.scan_storage.scan) @@ -486,7 +482,7 @@ struct AgentThreeWayPartition // Initialize selection_flags Initialize(num_tile_items, items, items_selected_flags); - CTA_SYNC(); + __syncthreads(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, ::cuda::std::plus<>{}, tile_idx); @@ -497,7 +493,7 @@ struct AgentThreeWayPartition AccumPackT num_items_in_tile_selected = prefix_op.GetBlockAggregate(); AccumPackT num_items_selected_prefix = prefix_op.GetExclusivePrefix(); - CTA_SYNC(); + __syncthreads(); OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - AccumPackHelperT::sum(num_items_selected_prefix); @@ -593,4 +589,7 @@ struct AgentThreeWayPartition } }; +} // namespace three_way_partition +} // namespace detail + CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index 30f5d4f50e4..a1a731f150f 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -92,6 +92,11 @@ struct AgentUniqueByKeyPolicy * Thread block abstractions ******************************************************************************/ +namespace detail +{ +namespace unique_by_key +{ + /** * @brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating * in device-wide unique-by-key @@ -286,7 +291,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); // Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset // types @@ -296,7 +301,7 @@ struct AgentUniqueByKey items_out[num_selections_prefix + item] = GetShared(tag)[item]; } - CTA_SYNC(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -337,7 +342,7 @@ struct AgentUniqueByKey BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -352,7 +357,7 @@ struct AgentUniqueByKey BlockLoadValues(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, keys, inequality_op); #pragma unroll @@ -365,7 +370,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; @@ -390,7 
+395,7 @@ struct AgentUniqueByKey } num_selections = num_tile_selections; - CTA_SYNC(); + __syncthreads(); Scatter(KeyTagT(), d_keys_out, @@ -402,7 +407,7 @@ struct AgentUniqueByKey num_selections_prefix, num_selections); - CTA_SYNC(); + __syncthreads(); Scatter(ValueTagT(), d_values_out, @@ -454,7 +459,7 @@ struct AgentUniqueByKey BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } - CTA_SYNC(); + __syncthreads(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -469,7 +474,7 @@ struct AgentUniqueByKey BlockLoadValues(temp_storage.load_values).Load(d_values_in + tile_offset, values); } - CTA_SYNC(); + __syncthreads(); KeyT tile_predecessor = d_keys_in[tile_offset - 1]; BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) @@ -485,7 +490,7 @@ struct AgentUniqueByKey } } - CTA_SYNC(); + __syncthreads(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; @@ -505,7 +510,7 @@ struct AgentUniqueByKey num_selections -= num_discount; } - CTA_SYNC(); + __syncthreads(); Scatter(KeyTagT(), d_keys_out, @@ -517,7 +522,7 @@ struct AgentUniqueByKey num_selections_prefix, num_selections); - CTA_SYNC(); + __syncthreads(); Scatter(ValueTagT(), d_values_out, @@ -606,4 +611,25 @@ struct AgentUniqueByKey } }; +} // namespace unique_by_key +} // namespace detail + +template +using AgentUniqueByKey CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") = + detail::unique_by_key::AgentUniqueByKey< + AgentUniqueByKeyPolicyT, + KeyInputIteratorT, + ValueInputIteratorT, + KeyOutputIteratorT, + ValueOutputIteratorT, + EqualityOpT, + OffsetT>; + CUB_NAMESPACE_END diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index 71469a0055a..bd6551b8f8d 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -733,7 +733,7 @@ public: tile_descriptor = reinterpret_cast(alias); } - while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)) + while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID))) { delay_or_prevent_hoisting(); TxnWord alias = LoadStatus(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); @@ -918,7 +918,7 @@ struct ScanTileState delay(); status = detail::load_relaxed(d_tile_status + TILE_STATUS_PADDING + tile_idx); __threadfence(); - } while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff)); + } while (__any_sync(0xffffffff, (status == SCAN_TILE_INVALID))); if (status == StatusWord(SCAN_TILE_PARTIAL)) { @@ -1145,7 +1145,7 @@ struct ReduceByKeyScanTileState TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); tile_descriptor = reinterpret_cast(alias); - } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + } while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID))); status = tile_descriptor.status; value.value = tile_descriptor.value; @@ -1268,7 +1268,7 @@ struct TilePrefixCallbackOp exclusive_prefix = window_aggregate; // Keep sliding the window back until we come across a tile whose inclusive prefix is known - while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) + while (__all_sync(0xffffffff, (predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)))) { predecessor_idx -= CUB_PTX_WARP_THREADS; diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index 5bc3bae3219..38636571e80 
100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -309,7 +309,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) @@ -408,7 +408,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) @@ -499,7 +499,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { @@ -622,7 +622,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { @@ -736,7 +736,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD - 1; item++) @@ -837,7 +837,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) @@ -926,7 +926,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); if ((linear_tid + 1) * ITEMS_PER_THREAD < valid_items) { diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index fb88dfac07f..e4998f32510 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -292,7 +292,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); if (linear_tid == 0) { @@ -337,7 +337,7 @@ public: // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread @@ -586,7 +586,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = @@ -686,7 +686,7 @@ public: // Share first item temp_storage.first_items[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
tile_successor_item : // Last thread @@ -790,7 +790,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -920,7 +920,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -1052,7 +1052,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; @@ -1189,7 +1189,7 @@ public: temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); T preds[ITEMS_PER_THREAD]; diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index bdc2a3dc932..d1ae91c223d 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -47,6 +47,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -179,7 +181,7 @@ private: // TODO(bgruber): can we use signed int here? Only these variables are unsigned: unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); - unsigned int lane_id = LaneId(); + unsigned int lane_id = ::cuda::ptx::get_sreg_laneid(); unsigned int warp_id = WARPS == 1 ? 0 : linear_tid / WARP_THREADS; unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS; @@ -215,7 +217,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -249,7 +251,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -265,7 +267,7 @@ private: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -322,7 +324,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -361,7 +363,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -378,7 +380,7 @@ private: #pragma unroll for (int slice = 1; slice < TIME_SLICES; ++slice) { - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -393,7 +395,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -434,7 +436,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); // No timeslicing #pragma unroll @@ -470,7 +472,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -493,7 +495,7 @@ private: } } - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -543,7 +545,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); 
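Most of the remaining churn in the block-level headers is the same mechanical substitution: CUB's synchronization and warp-primitive wrappers are replaced by the raw CUDA intrinsics and the libcu++ PTX helpers. The sketch below is illustrative only (the kernel and variable names are invented); note that the ballot/any/all/shuffle replacements take the mask first, flipping the old argument order, and that SHR_ADD becomes an explicit shift-and-add:

#include <cuda/ptx> // libcu++ header providing ::cuda::ptx::get_sreg_laneid()

__global__ void sync_replacement_example(int* out)
{
  const unsigned lane = ::cuda::ptx::get_sreg_laneid();        // was: LaneId()
  int value           = static_cast<int>(lane);

  __syncthreads();                                             // was: CTA_SYNC()
  const bool any_zero = __any_sync(0xffffffff, value == 0);    // was: WARP_ANY(value == 0, 0xffffffff)
  value               = __shfl_sync(0xffffffff, value, 0);     // was: SHFL_IDX_SYNC(value, 0, 0xffffffff)
  __syncwarp(0xffffffff);                                      // was: WARP_SYNC(0xffffffff)

  const int padded = (value >> 5) + value;                     // was: SHR_ADD(value, 5, value)

  // Similarly in these hunks: CTA_SYNC_AND/OR -> __syncthreads_and/_or,
  // WARP_BALLOT(p, mask) -> __ballot_sync(mask, p), (cub::min) -> (::cuda::std::min).
  out[threadIdx.x] = any_zero ? padded : value;
}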
#pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -572,7 +574,7 @@ private: #pragma unroll for (int slice = 0; slice < TIME_SLICES; ++slice) { - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -587,7 +589,7 @@ private: detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - WARP_SYNC(0xffffffff); + __syncwarp(0xffffffff); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -626,12 +628,12 @@ private: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -639,7 +641,7 @@ private: int item_offset = linear_tid * ITEMS_PER_THREAD + i; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -667,7 +669,7 @@ private: #pragma unroll for (int slice = 0; slice < TIME_SLICES; slice++) { - CTA_SYNC(); + __syncthreads(); const int slice_offset = TIME_SLICED_ITEMS * slice; @@ -679,13 +681,13 @@ private: { _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } - CTA_SYNC(); + __syncthreads(); if (warp_id == slice) { @@ -695,7 +697,7 @@ private: int item_offset = lane_id * ITEMS_PER_THREAD + i; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } temp_items[i] = temp_storage.buff[item_offset]; } @@ -733,12 +735,12 @@ private: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -746,7 +748,7 @@ private: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -777,7 +779,7 @@ private: const int slice_offset = slice * TIME_SLICED_ITEMS; const int slice_oob = slice_offset + TIME_SLICED_ITEMS; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -787,13 +789,13 @@ private: { _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1134,7 +1136,7 @@ public: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } if (ranks[i] >= 0) { @@ -1142,7 
+1144,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1150,7 +1152,7 @@ public: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } @@ -1193,7 +1195,7 @@ public: int item_offset = ranks[i]; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } if (is_valid[i]) { @@ -1201,7 +1203,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; i++) @@ -1209,7 +1211,7 @@ public: int item_offset = i * BLOCK_THREADS + linear_tid; _CCCL_IF_CONSTEXPR (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[i] = temp_storage.buff[item_offset]; } diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh index d5726f240f6..41abbd588b3 100644 --- a/cub/cub/block/block_histogram.cuh +++ b/cub/cub/block/block_histogram.cuh @@ -202,8 +202,8 @@ private: /// Internal specialization. using InternalBlockHistogram = ::cuda::std::_If, - BlockHistogramAtomic>; + detail::BlockHistogramSort, + detail::BlockHistogramAtomic>; /// Shared memory storage layout type for BlockHistogram using _TempStorage = typename InternalBlockHistogram::TempStorage; @@ -358,7 +358,7 @@ public: // Initialize histogram bin counts to zeros InitHistogram(histogram); - CTA_SYNC(); + __syncthreads(); // Composite the histogram InternalBlockHistogram(temp_storage).Composite(items, histogram); diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index b6d0c8a33b1..3ade5eb1609 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -43,6 +43,8 @@ #include #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -58,7 +60,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred) { OffsetT keys1_begin = diag < keys2_count ? 
0 : diag - keys2_count; - OffsetT keys1_end = (cub::min)(diag, keys1_count); + OffsetT keys1_end = (::cuda::std::min)(diag, keys1_count); while (keys1_begin < keys1_end) { @@ -425,12 +427,12 @@ public: int thread_idx_in_thread_group_being_merged = mask & linear_tid; - int diag = (cub::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged); + int diag = (::cuda::std::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged); - int keys1_beg = (cub::min)(valid_items, start); - int keys1_end = (cub::min)(valid_items, keys1_beg + size); + int keys1_beg = (::cuda::std::min)(valid_items, start); + int keys1_end = (::cuda::std::min)(valid_items, keys1_beg + size); int keys2_beg = keys1_end; - int keys2_end = (cub::min)(valid_items, keys2_beg + size); + int keys2_end = (::cuda::std::min)(valid_items, keys2_beg + size); int keys1_count = keys1_end - keys1_beg; int keys2_count = keys2_end - keys2_beg; @@ -760,7 +762,7 @@ public: private: _CCCL_DEVICE _CCCL_FORCEINLINE void SyncImplementation() const { - CTA_SYNC(); + __syncthreads(); } friend BlockMergeSortStrategyT; diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 92605b5168d..ad495e1db31 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -48,6 +48,7 @@ #include #include +#include #include #include #include @@ -477,12 +478,12 @@ public: *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; } - CTA_SYNC(); + __syncthreads(); // Scan shared memory counters ScanCounters(); - CTA_SYNC(); + __syncthreads(); // Extract the local ranks of each key #pragma unroll @@ -710,13 +711,13 @@ public: temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; } - CTA_SYNC(); + __syncthreads(); // Each warp will strip-mine its section of input, one strip at a time volatile DigitCounterT* digit_counters[KEYS_PER_THREAD]; uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; - uint32_t lane_mask_lt = LaneMaskLt(); + uint32_t lane_mask_lt = ::cuda::ptx::get_sreg_lanemask_lt(); #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) @@ -740,7 +741,7 @@ public: DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; // Warp-sync - WARP_SYNC(0xFFFFFFFF); + __syncwarp(0xFFFFFFFF); // Number of peers having same digit as me int32_t digit_count = __popc(peer_mask); @@ -755,13 +756,13 @@ public: } // Warp-sync - WARP_SYNC(0xFFFFFFFF); + __syncwarp(0xFFFFFFFF); // Number of prior keys having same digit ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); } - CTA_SYNC(); + __syncthreads(); // Scan warp counters @@ -781,7 +782,7 @@ public: temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; } - CTA_SYNC(); + __syncthreads(); if (!::cuda::std::is_same>::value) { CallBack(callback); @@ -977,7 +978,7 @@ struct BlockRadixRankMatchEarlyCounts match_masks[bin] = 0; } } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); // compute private per-part histograms int part = lane % NUM_PARTS; @@ -991,7 +992,7 @@ struct BlockRadixRankMatchEarlyCounts // no extra work is necessary if NUM_PARTS == 1 if (NUM_PARTS > 1) { - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); // TODO: handle RADIX_DIGITS % WARP_THREADS != 0 if it becomes necessary constexpr int WARP_BINS_PER_THREAD = RADIX_DIGITS / WARP_THREADS; int bins[WARP_BINS_PER_THREAD]; @@ -1001,7 +1002,7 @@ struct BlockRadixRankMatchEarlyCounts int bin = lane + u * WARP_THREADS; bins[u] = cub::ThreadReduce(warp_histograms[bin], ::cuda::std::plus<>{}); } - CTA_SYNC(); + 
__syncthreads(); // store the resulting histogram in shared memory int* warp_offsets = &s.warp_offsets[warp][0]; @@ -1066,22 +1067,22 @@ struct BlockRadixRankMatchEarlyCounts ::cuda::std::uint32_t bin = Digit(keys[u]); int* p_match_mask = &match_masks[bin]; atomicOr(p_match_mask, lane_mask); - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); int bin_mask = *p_match_mask; int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; - int popc = __popc(bin_mask & LaneMaskLe()); + int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) { // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } - warp_offset = SHFL_IDX_SYNC(warp_offset, leader, WARP_MASK); + warp_offset = __shfl_sync(WARP_MASK, warp_offset, leader); if (lane == leader) { *p_match_mask = 0; } - WARP_SYNC(WARP_MASK); + __syncwarp(WARP_MASK); ranks[u] = warp_offset + popc - 1; } } @@ -1099,13 +1100,13 @@ struct BlockRadixRankMatchEarlyCounts detail::warp_in_block_matcher_t::match_any(bin, warp); int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; - int popc = __popc(bin_mask & LaneMaskLe()); + int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) { // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } - warp_offset = SHFL_IDX_SYNC(warp_offset, leader, WARP_MASK); + warp_offset = __shfl_sync(WARP_MASK, warp_offset, leader); ranks[u] = warp_offset + popc - 1; } } @@ -1117,7 +1118,7 @@ struct BlockRadixRankMatchEarlyCounts { ComputeHistogramsWarp(keys); - CTA_SYNC(); + __syncthreads(); int bins[BINS_PER_THREAD]; ComputeOffsetsWarpUpsweep(bins); callback(bins); @@ -1125,7 +1126,7 @@ struct BlockRadixRankMatchEarlyCounts BlockScan(s.prefix_tmp).ExclusiveSum(bins, exclusive_digit_prefix); ComputeOffsetsWarpDownsweep(exclusive_digit_prefix); - CTA_SYNC(); + __syncthreads(); ComputeRanksItem(keys, ranks, Int2Type()); } @@ -1135,7 +1136,7 @@ struct BlockRadixRankMatchEarlyCounts , digit_extractor(digit_extractor) , callback(callback) , warp(threadIdx.x / WARP_THREADS) - , lane(LaneId()) + , lane(::cuda::ptx::get_sreg_laneid()) {} }; diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 3223b920b13..080053348d7 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -364,7 +364,7 @@ private: Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { - CTA_SYNC(); + __syncthreads(); // Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); @@ -377,7 +377,7 @@ private: Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { - CTA_SYNC(); + __syncthreads(); // Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); @@ -443,7 +443,7 @@ private: RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; - CTA_SYNC(); + __syncthreads(); // Exchange keys through shared memory in blocked arrangement BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); @@ -457,7 +457,7 @@ private: break; } - CTA_SYNC(); + __syncthreads(); } // Untwiddle bits if necessary @@ -522,7 +522,7 @@ public: RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; - CTA_SYNC(); + __syncthreads(); // Check if this is the last pass if (begin_bit >= end_bit) @@ -543,7 +543,7 @@ public: // Exchange values through 
shared memory in blocked arrangement ExchangeValues(values, ranks, is_keys_only, Int2Type()); - CTA_SYNC(); + __syncthreads(); } // Untwiddle bits if necessary diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh index 6828f6fa706..6cf578963fc 100644 --- a/cub/cub/block/block_reduce.cuh +++ b/cub/cub/block/block_reduce.cuh @@ -250,9 +250,9 @@ private: BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; - using WarpReductions = BlockReduceWarpReductions; - using RakingCommutativeOnly = BlockReduceRakingCommutativeOnly; - using Raking = BlockReduceRaking; + using WarpReductions = detail::BlockReduceWarpReductions; + using RakingCommutativeOnly = detail::BlockReduceRakingCommutativeOnly; + using Raking = detail::BlockReduceRaking; /// Internal specialization type using InternalBlockReduce = diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 0dca0a5d838..467d9141dc3 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -44,6 +44,9 @@ #include #include +#include +#include + #include #include @@ -284,7 +287,7 @@ private: for (int i = 0; i <= Log2::VALUE; i++) { OffsetT mid = cub::MidPoint(lower_bound, upper_bound); - mid = (cub::min)(mid, num_items - 1); + mid = (::cuda::std::min)(mid, num_items - 1); if (val < input[mid]) { @@ -314,7 +317,7 @@ private: } // Ensure run offsets and run values have been written to shared memory - CTA_SYNC(); + __syncthreads(); } template @@ -335,7 +338,7 @@ private: total_decoded_size = static_cast(decoded_size_aggregate); // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation) - CTA_SYNC(); + __syncthreads(); InitWithRunOffsets(run_values, run_offsets); } diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index c49eb36a52e..c25bd2d258d 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -250,9 +250,9 @@ private: ? BLOCK_SCAN_RAKING : ALGORITHM; - using WarpScans = BlockScanWarpScans; + using WarpScans = detail::BlockScanWarpScans; using Raking = - BlockScanRaking; + detail::BlockScanRaking; /// Define the delegate type for the desired algorithm using InternalBlockScan = ::cuda::std::_If; @@ -477,7 +477,7 @@ public: //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -714,17 +714,17 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide exclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage.scan).ExclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. @@ -957,7 +957,7 @@ public: //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! 
__syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -1230,16 +1230,16 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage.scan).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. @@ -1618,7 +1618,7 @@ public: //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage).InclusiveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -1874,16 +1874,16 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage.scan).IncluisveSum( //! thread_data, thread_data, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. @@ -2123,7 +2123,7 @@ public: //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage).InclusiveScan( //! thread_data, thread_data, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; @@ -2516,16 +2516,16 @@ public: //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage.scan).InclusiveScan( //! thread_data, thread_data, cuda::maximum<>{}, prefix_op); - //! CTA_SYNC(); + //! __syncthreads(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - //! CTA_SYNC(); + //! __syncthreads(); //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. 
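For reference, a minimal self-contained sketch of the pattern the updated BlockScan documentation snippets above now show: cub::BlockLoad / BlockScan / BlockStore sharing aliased shared memory, separated by the plain CUDA barrier __syncthreads() that this patch substitutes for the internal CTA_SYNC() macro. The kernel name, tile size, and buffer name below are illustrative assumptions, not part of the patch.

#include <cub/block/block_load.cuh>
#include <cub/block/block_scan.cuh>
#include <cub/block/block_store.cuh>

// Illustrative sketch only (not part of this patch): one tile of
// 128 * 4 ints is loaded, exclusive-prefix-summed, and stored by a
// single thread block.
__global__ void block_scan_example(int* d_data)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;

  using BlockLoad  = cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>;
  using BlockScan  = cub::BlockScan<int, BLOCK_THREADS>;
  using BlockStore = cub::BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD>;

  // The collectives alias one shared-memory allocation, so a block-wide
  // barrier is required between them -- this is what the __syncthreads()
  // calls in the documentation snippets above guard.
  __shared__ union TempStorage
  {
    typename BlockLoad::TempStorage  load;
    typename BlockScan::TempStorage  scan;
    typename BlockStore::TempStorage store;
  } temp_storage;

  int thread_data[ITEMS_PER_THREAD];
  BlockLoad(temp_storage.load).Load(d_data, thread_data);
  __syncthreads(); // load storage is reused by the scan below

  BlockScan(temp_storage.scan).ExclusiveSum(thread_data, thread_data);
  __syncthreads(); // scan storage is reused by the store below

  BlockStore(temp_storage.store).Store(d_data, thread_data);
}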
diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index a3dedcc3c70..93d8715c63b 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -164,7 +164,7 @@ public: { temp_storage[linear_tid] = input; - CTA_SYNC(); + __syncthreads(); const int offset_tid = static_cast(linear_tid) + distance; if ((offset_tid >= 0) && (offset_tid < BLOCK_THREADS)) @@ -196,7 +196,7 @@ public: { temp_storage[linear_tid] = input; - CTA_SYNC(); + __syncthreads(); unsigned int offset = linear_tid + distance; if (offset >= BLOCK_THREADS) @@ -230,7 +230,7 @@ public: { temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) @@ -298,7 +298,7 @@ public: { temp_storage[linear_tid] = input[0]; - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index 443f7a7f93b..e207a1d76c1 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -897,7 +897,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; @@ -980,7 +980,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; @@ -1063,7 +1063,7 @@ private: // subsequent loads temp_storage.valid_items = valid_items; } - CTA_SYNC(); + __syncthreads(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh index d4fdd9c405f..35bdfe8ee02 100644 --- a/cub/cub/block/radix_rank_sort_operations.cuh +++ b/cub/cub/block/radix_rank_sort_operations.cuh @@ -49,6 +49,8 @@ #include +#include +#include #include #include #include @@ -437,7 +439,7 @@ struct digit_f using traits = traits_t::type>; using bit_ordered_type = typename traits::bit_ordered_type; - const ::cuda::std::uint32_t bits_to_copy = min(src_size - src_bit_start, num_bits); + const ::cuda::std::uint32_t bits_to_copy = (::cuda::std::min)(src_size - src_bit_start, num_bits); if (bits_to_copy) { diff --git a/cub/cub/block/specializations/block_histogram_atomic.cuh b/cub/cub/block/specializations/block_histogram_atomic.cuh index 8edc8575c40..4103641dbe2 100644 --- a/cub/cub/block/specializations/block_histogram_atomic.cuh +++ b/cub/cub/block/specializations/block_histogram_atomic.cuh @@ -45,7 +45,8 @@ #endif // no system header CUB_NAMESPACE_BEGIN - +namespace detail +{ /** * @brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide * histograms from data samples partitioned across a CUDA thread block. 
@@ -72,7 +73,7 @@ struct BlockHistogramAtomic template _CCCL_DEVICE _CCCL_FORCEINLINE void Composite(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS]) { -// Update histogram + // Update histogram #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; ++i) { @@ -80,5 +81,11 @@ struct BlockHistogramAtomic } } }; +} // namespace detail + +template +using BlockHistogramAtomic CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockHistogramAtomic; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_histogram_sort.cuh b/cub/cub/block/specializations/block_histogram_sort.cuh index 7ef3c1264a5..127f30953b2 100644 --- a/cub/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/cub/block/specializations/block_histogram_sort.cuh @@ -49,7 +49,8 @@ #include CUB_NAMESPACE_BEGIN - +namespace detail +{ /** * @brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide * histograms from data samples partitioned across a CUDA thread block. @@ -187,7 +188,7 @@ struct BlockHistogramSort // Sort bytes in blocked arrangement BlockRadixSortT(temp_storage.sort).Sort(items); - CTA_SYNC(); + __syncthreads(); // Initialize the shared memory's run_begin and run_end for each bin int histo_offset = 0; @@ -205,7 +206,7 @@ struct BlockHistogramSort temp_storage.discontinuities.run_end[histo_offset + linear_tid] = TILE_SIZE; } - CTA_SYNC(); + __syncthreads(); int flags[ITEMS_PER_THREAD]; // unused @@ -219,7 +220,7 @@ struct BlockHistogramSort temp_storage.discontinuities.run_begin[items[0]] = 0; } - CTA_SYNC(); + __syncthreads(); // Composite into histogram histo_offset = 0; @@ -243,5 +244,18 @@ struct BlockHistogramSort } } }; +} // namespace detail + +template +using BlockHistogramSort CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = + detail::BlockHistogramSort; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 7c1db2c9050..90f8f12236f 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -50,7 +50,8 @@ #include CUB_NAMESPACE_BEGIN - +namespace detail +{ /** * @brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread * block. Supports non-commutative reduction operators. @@ -212,7 +213,7 @@ struct BlockReduceRaking // Place partial into shared memory grid. 
*BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) @@ -228,7 +229,7 @@ struct BlockReduceRaking // sync before re-using shmem (warp_storage/raking_grid are aliased) static_assert(RAKING_THREADS <= CUB_PTX_WARP_THREADS, "RAKING_THREADS must be <= warp size."); unsigned int mask = static_cast((1ull << RAKING_THREADS) - 1); - WARP_SYNC(mask); + __syncwarp(mask); partial = WarpReduce(temp_storage.warp_storage) .template Reduce<(IS_FULL_TILE && RAKING_UNGUARDED)>(partial, valid_raking_threads, reduction_op); @@ -257,5 +258,11 @@ struct BlockReduceRaking return Reduce(partial, num_valid, reduction_op); } }; +} // namespace detail + +template +using BlockReduceRaking CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockReduceRaking; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh index 49401e87fb4..7841db5f18a 100644 --- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -50,7 +50,8 @@ #include CUB_NAMESPACE_BEGIN - +namespace detail +{ /** * @brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction * across a CUDA thread block. Does not support non-commutative reduction operators. Does not @@ -83,7 +84,7 @@ struct BlockReduceRakingCommutativeOnly // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have // valid values - using FallBack = BlockReduceRaking; + using FallBack = detail::BlockReduceRaking; /// Constants enum @@ -167,7 +168,7 @@ struct BlockReduceRakingCommutativeOnly partial; } - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) @@ -214,7 +215,7 @@ struct BlockReduceRakingCommutativeOnly partial; } - CTA_SYNC(); + __syncthreads(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) @@ -231,5 +232,11 @@ struct BlockReduceRakingCommutativeOnly return partial; } }; +} // namespace detail + +template +using BlockReduceRakingCommutativeOnly CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockReduceRakingCommutativeOnly; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 4ee2b307bcf..2dfa526771f 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -48,8 +48,11 @@ #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN +namespace detail +{ /** * @brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction * across a CUDA thread block. Supports non-commutative reduction operators. @@ -121,7 +124,7 @@ struct BlockReduceWarpReductions : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS) - , lane_id(LaneId()) + , lane_id(::cuda::ptx::get_sreg_laneid()) {} /** @@ -184,7 +187,7 @@ struct BlockReduceWarpReductions detail::uninitialized_copy_single(temp_storage.warp_aggregates + warp_id, warp_aggregate); } - CTA_SYNC(); + __syncthreads(); // Update total aggregate in warp 0, lane 0 if (linear_tid == 0) @@ -254,5 +257,11 @@ struct BlockReduceWarpReductions return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } }; +} // namespace detail + +template +using BlockReduceWarpReductions CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockReduceWarpReductions; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_scan_raking.cuh b/cub/cub/block/specializations/block_scan_raking.cuh index f0fe7a5ca2a..2af4b8693fc 100644 --- a/cub/cub/block/specializations/block_scan_raking.cuh +++ b/cub/cub/block/specializations/block_scan_raking.cuh @@ -52,7 +52,8 @@ #include CUB_NAMESPACE_BEGIN - +namespace detail +{ /** * @brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA * thread block. @@ -302,7 +303,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -318,7 +319,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory exclusive_output = *placement_ptr; @@ -355,7 +356,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -371,7 +372,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, exclusive_partial); } - CTA_SYNC(); + __syncthreads(); // Grab exclusive partial from shared memory output = *placement_ptr; @@ -410,7 +411,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -433,7 +434,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -478,7 +479,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -501,7 +502,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab exclusive partial from shared memory output = *placement_ptr; @@ -559,7 +560,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -588,7 +589,7 @@ struct BlockScanRaking ExclusiveDownsweep(scan_op, downsweep_prefix); } - CTA_SYNC(); + __syncthreads(); // 
Grab thread prefix from shared memory output = *placement_ptr; @@ -626,7 +627,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -642,7 +643,7 @@ struct BlockScanRaking InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -680,7 +681,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -703,7 +704,7 @@ struct BlockScanRaking } } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -758,7 +759,7 @@ struct BlockScanRaking T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy_single(placement_ptr, input); - CTA_SYNC(); + __syncthreads(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) @@ -787,12 +788,18 @@ struct BlockScanRaking InclusiveDownsweep(scan_op, downsweep_prefix); } - CTA_SYNC(); + __syncthreads(); // Grab thread prefix from shared memory output = *placement_ptr; } } }; +} // namespace detail + +template +using BlockScanRaking CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockScanRaking; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh index 851a71cbe7b..d034d2838ea 100644 --- a/cub/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh @@ -47,8 +47,11 @@ #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN +namespace detail +{ /** * @brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA * thread block. @@ -127,7 +130,7 @@ struct BlockScanWarpScans : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS) - , lane_id(LaneId()) + , lane_id(::cuda::ptx::get_sreg_laneid()) {} //--------------------------------------------------------------------- @@ -197,7 +200,7 @@ struct BlockScanWarpScans detail::uninitialized_copy_single(temp_storage.warp_aggregates + warp_id, warp_aggregate); } - CTA_SYNC(); + __syncthreads(); // Accumulate block aggregates and save the one that is our warp's prefix T warp_prefix; @@ -423,7 +426,7 @@ struct BlockScanWarpScans } } - CTA_SYNC(); + __syncthreads(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; @@ -528,12 +531,17 @@ struct BlockScanWarpScans } } - CTA_SYNC(); + __syncthreads(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; exclusive_output = scan_op(block_prefix, exclusive_output); } }; +} // namespace detail +template +using BlockScanWarpScans CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::BlockScanWarpScans; CUB_NAMESPACE_END diff --git a/cub/cub/detail/device_synchronize.cuh b/cub/cub/detail/device_synchronize.cuh deleted file mode 100644 index 1d71c6ebc0d..00000000000 --- a/cub/cub/detail/device_synchronize.cuh +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2021 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include - -#include - -#include - -CUB_NAMESPACE_BEGIN - -namespace detail -{ - -/** - * Call `cudaDeviceSynchronize()` using the proper API for the current CUB and - * CUDA configuration. 
- */ -_CCCL_EXEC_CHECK_DISABLE -CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() -{ - cudaError_t result = cudaErrorNotSupported; - NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), ()); - return result; -} - -} // namespace detail - -CUB_NAMESPACE_END diff --git a/cub/cub/detail/temporary_storage.cuh b/cub/cub/detail/temporary_storage.cuh index cf5f98e775a..f271ce804a9 100644 --- a/cub/cub/detail/temporary_storage.cuh +++ b/cub/cub/detail/temporary_storage.cuh @@ -29,6 +29,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -96,7 +98,7 @@ public: private: _CCCL_HOST_DEVICE void set_bytes_required(std::size_t new_size) { - m_size = (max) (m_size, new_size); + m_size = (::cuda::std::max)(m_size, new_size); } _CCCL_HOST_DEVICE std::size_t get_bytes_required() const diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index 1af5f01f033..b910bb91a2b 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -43,8 +43,6 @@ #include #include -#include - #include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 1fb5656b82f..2347666289e 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -41,10 +41,13 @@ # pragma system_header #endif // no system header +#include #include #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -140,16 +143,19 @@ private: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; + + using OffsetT = + detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; @@ -286,8 +292,8 @@ public: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -313,16 +319,19 @@ private: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; + + using OffsetT = + detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; @@ -454,8 +463,8 @@ public: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -480,17 +489,18 @@ private: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, 
BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; - + using OffsetT = + detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_values; @@ -632,8 +642,8 @@ public: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -650,17 +660,18 @@ private: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; - + using OffsetT = + detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_values; @@ -803,8 +814,8 @@ public: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -931,8 +942,8 @@ public: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1067,8 +1078,8 @@ public: std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1213,8 +1224,8 @@ public: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1350,8 +1361,8 @@ public: void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1371,15 +1382,19 @@ private: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; - using DispatchT = DispatchSegmentedSort; + + using OffsetT = + detail::choose_signed_offset_t>; + using DispatchT = + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); @@ -1539,8 +1554,8 @@ public: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + 
::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1570,15 +1585,19 @@ private: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; - using DispatchT = DispatchSegmentedSort; + + using OffsetT = + detail::choose_signed_offset_t>; + using DispatchT = + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); @@ -1734,8 +1753,8 @@ public: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1763,15 +1782,19 @@ private: std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; - using DispatchT = DispatchSegmentedSort; + + using OffsetT = + detail::choose_signed_offset_t>; + using DispatchT = + DispatchSegmentedSort; return DispatchT::Dispatch( d_temp_storage, @@ -1931,8 +1954,8 @@ public: std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -1958,15 +1981,19 @@ private: std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; - using DispatchT = DispatchSegmentedSort; + + using OffsetT = + detail::choose_signed_offset_t>; + using DispatchT = + DispatchSegmentedSort; return DispatchT::Dispatch( d_temp_storage, @@ -2125,8 +2152,8 @@ public: std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -2281,8 +2308,8 @@ public: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -2439,8 +2466,8 @@ public: KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -2605,8 +2632,8 @@ public: std::size_t& temp_storage_bytes, 
DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) @@ -2768,8 +2795,8 @@ public: std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - int num_items, - int num_segments, + ::cuda::std::int64_t num_items, + ::cuda::std::int64_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index 5a751181842..241af8cd1d1 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -78,7 +78,7 @@ CUB_NAMESPACE_BEGIN //! @cdp_class{DeviceSpmv} //! //! @endrst -struct DeviceSpmv +struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DeviceSpmv { //! @name CSR matrix operations //! @{ @@ -177,18 +177,19 @@ struct DeviceSpmv //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template - CUB_RUNTIME_FUNCTION static cudaError_t CsrMV( - void* d_temp_storage, - size_t& temp_storage_bytes, - const ValueT* d_values, - const int* d_row_offsets, - const int* d_column_indices, - const ValueT* d_vector_x, - ValueT* d_vector_y, - int num_rows, - int num_cols, - int num_nonzeros, - cudaStream_t stream = 0) + CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") + CUB_RUNTIME_FUNCTION static cudaError_t + CsrMV(void* d_temp_storage, + size_t& temp_storage_bytes, + const ValueT* d_values, + const int* d_row_offsets, + const int* d_column_indices, + const ValueT* d_vector_x, + ValueT* d_vector_y, + int num_rows, + int num_cols, + int num_nonzeros, + cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSpmv::CsrMV"); @@ -204,7 +205,9 @@ struct DeviceSpmv spmv_params.alpha = ValueT{1}; spmv_params.beta = ValueT{0}; + _CCCL_SUPPRESS_DEPRECATED_PUSH return DispatchSpmv::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); + _CCCL_SUPPRESS_DEPRECATED_POP } //! @} end member group diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index a8c733ef309..e717277e520 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -51,6 +51,9 @@ CUB_NAMESPACE_BEGIN +namespace detail::adjacent_difference +{ + template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile) @@ -78,7 +81,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel( // It is OK to introspect the return type or parameter types of the // `operator()` function of `__device__` extended lambda within device code. 
- using OutputT = detail::invoke_result_t; + using OutputT = invoke_result_t; using Agent = AgentDifference; + using AgentDifferenceInitT = + detail::adjacent_difference::AgentDifferenceInit; constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS; const int init_grid_size = ::cuda::ceil_div(num_tiles, init_block_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceAdjacentDifferenceInitKernel" "<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_block_size, reinterpret_cast(stream)); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream) - .doit(DeviceAdjacentDifferenceInitKernel, + .doit(detail::adjacent_difference:: + DeviceAdjacentDifferenceInitKernel, d_input, first_tile_previous, num_tiles, @@ -219,17 +226,17 @@ struct DispatchAdjacentDifference } } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel" "<<<%d, %d, 0, %lld>>>()\n", num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, reinterpret_cast(stream)); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) - .doit(DeviceAdjacentDifferenceDifferenceKernel< + .doit(detail::adjacent_difference::DeviceAdjacentDifferenceDifferenceKernel< typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 46ff7cbced6..c870221f3e1 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -54,6 +54,8 @@ #include +#include +#include #include #include @@ -62,6 +64,8 @@ CUB_NAMESPACE_BEGIN namespace detail { +namespace batch_memcpy +{ /** * Initialization kernel for tile status initialization (multi-block) */ @@ -100,15 +104,13 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO { using StatusWord = typename TileT::StatusWord; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT; - using BufferSizeT = cub::detail::value_t; + using BufferSizeT = value_t; /// Internal load/store type. 
For byte-wise memcpy, a single-byte type - using AliasT = - typename ::cuda::std::conditional, - std::iterator_traits>>::type::value_type; + using AliasT = typename ::cuda::std:: + conditional, std::iterator_traits>>::type::value_type; /// Types of the input and output buffers - using InputBufferT = cub::detail::value_t; - using OutputBufferT = cub::detail::value_t; + using InputBufferT = value_t; + using OutputBufferT = value_t; constexpr uint32_t BLOCK_THREADS = ActivePolicyT::BLOCK_THREADS; constexpr uint32_t ITEMS_PER_THREAD = ActivePolicyT::BYTES_PER_THREAD; @@ -131,7 +133,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO // Make sure thread 0 does not overwrite the buffer id before other threads have finished with // the prior iteration of the loop - CTA_SYNC(); + __syncthreads(); // Binary search the buffer that this tile belongs to if (threadIdx.x == 0) @@ -140,7 +142,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO } // Make sure thread 0 has written the buffer this thread block is assigned to - CTA_SYNC(); + __syncthreads(); const BufferOffsetT buffer_id = block_buffer_id; @@ -173,7 +175,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO copy_items( input_buffer_it[buffer_id], output_buffer_it[buffer_id], - (cub::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), + (::cuda::std::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), tile_offset_within_buffer); } @@ -229,7 +231,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLO BLevBlockOffsetTileState blev_block_scan_state) { // Internal type used for storing a buffer's size - using BufferSizeT = cub::detail::value_t; + using BufferSizeT = value_t; // Alias the correct tuning policy for the current compilation pass' architecture using AgentBatchMemcpyPolicyT = typename ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT; @@ -268,6 +270,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLO blev_block_scan_state) .ConsumeTile(blockIdx.x); } +} // namespace batch_memcpy /** * @tparam InputBufferIt **[inferred]** Random-access input iterator type providing the pointers @@ -462,8 +465,8 @@ struct DispatchBatchMemcpy // Kernels auto init_scan_states_kernel = - InitTileStateKernel; - auto batch_memcpy_non_blev_kernel = BatchMemcpyKernel< + detail::batch_memcpy::InitTileStateKernel; + auto batch_memcpy_non_blev_kernel = detail::batch_memcpy::BatchMemcpyKernel< typename PolicyHub::MaxPolicy, InputBufferIt, OutputBufferIt, @@ -478,7 +481,7 @@ struct DispatchBatchMemcpy BLevBlockOffsetTileState, IsMemcpy>; - auto multi_block_memcpy_kernel = MultiBlockBatchMemcpyKernel< + auto multi_block_memcpy_kernel = detail::batch_memcpy::MultiBlockBatchMemcpyKernel< typename PolicyHub::MaxPolicy, BufferOffsetT, BlevBufferSrcsOutItT, @@ -536,7 +539,7 @@ struct DispatchBatchMemcpy return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "InitTileStateKernel<<<%d, %d, 0, %lld>>>()\n", static_cast(init_grid_size), @@ -564,7 +567,7 @@ struct DispatchBatchMemcpy return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "BatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n", static_cast(batch_memcpy_grid_size), @@ -603,7 +606,7 @@ struct DispatchBatchMemcpy return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "MultiBlockBatchMemcpyKernel<<<%d, %d, 0, 
%lld>>>()\n", static_cast(batch_memcpy_blev_grid_size), diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh index 7ba478e3c00..895ac9821fb 100644 --- a/cub/cub/device/dispatch/dispatch_for.cuh +++ b/cub/cub/device/dispatch/dispatch_for.cuh @@ -51,10 +51,7 @@ CUB_NAMESPACE_BEGIN -namespace detail -{ - -namespace for_each +namespace detail::for_each { // The dispatch layer is in the detail namespace until we figure out tuning API @@ -101,7 +98,7 @@ struct dispatch_t const auto tile_size = static_cast(block_threads * items_per_thread); const auto num_tiles = ::cuda::ceil_div(num_items, tile_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread\n", static_cast(num_tiles), @@ -144,7 +141,7 @@ struct dispatch_t const auto tile_size = static_cast(block_threads * items_per_thread); const auto num_tiles = ::cuda::ceil_div(num_items, tile_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread\n", static_cast(num_tiles), @@ -195,8 +192,6 @@ struct dispatch_t } }; -} // namespace for_each - -} // namespace detail +} // namespace detail::for_each CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh index 6e346316d48..a4770a1b98c 100644 --- a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh +++ b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh @@ -73,9 +73,7 @@ CUB_NAMESPACE_BEGIN -namespace detail -{ -namespace for_each_in_extents +namespace detail::for_each_in_extents { // The dispatch layer is in the detail namespace until we figure out the tuning API @@ -117,7 +115,7 @@ public: constexpr unsigned items_per_thread = ActivePolicyT::for_policy_t::items_per_thread; unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread); -# ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +# ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each_in_extents::static_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n", num_cta, block_threads, @@ -155,7 +153,7 @@ public: _CUB_RETURN_IF_ERROR(status) unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread); -# ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +# ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each_in_extents::dynamic_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n", num_cta, block_threads, @@ -203,8 +201,7 @@ private: unsigned_index_type _size; }; -} // namespace for_each_in_extents -} // namespace detail +} // namespace detail::for_each_in_extents CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 900f758cdfb..2ac4e160220 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -1,4 +1,3 @@ - /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
@@ -98,6 +97,9 @@ CUB_NAMESPACE_BEGIN * @param tile_queue * Drain queue descriptor for dynamically mapping tile data onto thread blocks */ +namespace detail::histogram +{ + template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramInitKernel( ::cuda::std::array num_output_bins_wrapper, @@ -254,9 +256,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK agent.StoreOutput(); } -namespace detail -{ - template >>()\n", histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke histogram_init_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -453,7 +452,7 @@ struct dispatch_histogram } // Log histogram_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels " "per thread, %d SM occupancy\n", sweep_grid_dims.x, @@ -463,7 +462,7 @@ struct dispatch_histogram (long long) stream, pixels_per_thread, histogram_sweep_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke histogram_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream) @@ -503,20 +502,21 @@ struct dispatch_histogram CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { return Invoke( - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel); + detail::histogram::DeviceHistogramInitKernel, + detail::histogram::DeviceHistogramSweepKernel< + MaxPolicyT, + PRIVATIZED_SMEM_BINS, + NUM_CHANNELS, + NUM_ACTIVE_CHANNELS, + SampleIteratorT, + CounterT, + PrivatizedDecodeOpT, + OutputDecodeOpT, + OffsetT>); } }; -} // namespace detail +} // namespace detail::histogram /****************************************************************************** * Dispatch @@ -959,7 +959,7 @@ public: // Too many bins to keep in shared memory. 
constexpr int PRIVATIZED_SMEM_BINS = 0; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, @@ -995,7 +995,7 @@ public: // Dispatch shared-privatized approach constexpr int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, @@ -1129,7 +1129,7 @@ public: constexpr int PRIVATIZED_SMEM_BINS = 256; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, @@ -1277,7 +1277,7 @@ public: // Dispatch shared-privatized approach constexpr int PRIVATIZED_SMEM_BINS = 0; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, @@ -1313,7 +1313,7 @@ public: // Dispatch shared-privatized approach constexpr int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, @@ -1451,7 +1451,7 @@ public: constexpr int PRIVATIZED_SMEM_BINS = 256; - detail::dispatch_histogram< + detail::histogram::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index ff43656c5c5..b3d0c8ab2ca 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -21,10 +21,11 @@ #include +#include +#include + CUB_NAMESPACE_BEGIN -namespace detail -{ -namespace merge +namespace detail::merge { _CCCL_INLINE_VAR constexpr int fallback_BLOCK_THREADS = 64; _CCCL_INLINE_VAR constexpr int fallback_ITEMS_PER_THREAD = 1; @@ -80,7 +81,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( const Offset partition_idx = blockDim.x * blockIdx.x + threadIdx.x; if (partition_idx < num_partitions) { - const Offset partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count); + const Offset partition_at = (::cuda::std::min)(partition_idx * items_per_tile, keys1_count + keys2_count); merge_partitions[partition_idx] = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op); } } @@ -300,6 +301,5 @@ struct dispatch_t return cudaSuccess; } }; -} // namespace merge -} // namespace detail +} // namespace detail::merge CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 1d455bdfbf1..e8cc91e8420 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -47,11 +47,13 @@ #include #include +#include +#include #include CUB_NAMESPACE_BEGIN -namespace detail +namespace detail::merge_sort { /** @@ -95,7 +97,7 @@ private: using block_sort_helper_t = dual_policy_agent_helper_t< DefaultPolicyT, fallback_policy_t, - AgentBlockSort, + merge_sort::AgentBlockSort, KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, @@ -112,7 +114,7 @@ private: using merge_helper_t = dual_policy_agent_helper_t< DefaultPolicyT, fallback_policy_t, - AgentMerge, + merge_sort::AgentMerge, KeyIteratorT, ValueIteratorT, OffsetT, @@ -125,9 +127,10 @@ private: // Use fallback if either (a) the default block sort or (b) the block merge agent exceed the maximum shared memory // available per block and both (1) the fallback block sort and (2) the fallback merge agent would 
not exceed the // available shared memory - static constexpr auto max_default_size = (cub::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); + static constexpr auto max_default_size = + (::cuda::std::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); static constexpr auto max_fallback_size = - (cub::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); + (::cuda::std::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); static constexpr bool uses_fallback_policy = (max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block); @@ -137,8 +140,6 @@ public: ::cuda::std::_If; using merge_agent_t = ::cuda::std::_If; }; -} // namespace detail - template __launch_bounds__( - cub::detail::merge_sort_vsmem_helper_t< - typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>::policy_t::BLOCK_THREADS) + merge_sort_vsmem_helper_t::policy_t::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel( bool ping, KeyInputIteratorT keys_in, @@ -169,9 +169,9 @@ __launch_bounds__( KeyT* tmp_keys_out, ValueT* tmp_items_out, CompareOpT compare_op, - cub::detail::vsmem_t vsmem) + vsmem_t vsmem) { - using MergeSortHelperT = cub::detail::merge_sort_vsmem_helper_t< + using MergeSortHelperT = merge_sort_vsmem_helper_t< typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, KeyInputIteratorT, ValueInputIteratorT, @@ -186,7 +186,7 @@ __launch_bounds__( using AgentBlockSortT = typename MergeSortHelperT::block_sort_agent_t; - using VSmemHelperT = cub::detail::vsmem_helper_impl; + using VSmemHelperT = vsmem_helper_impl; // Static shared memory allocation __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage; @@ -256,16 +256,15 @@ template __launch_bounds__( - cub::detail::merge_sort_vsmem_helper_t< - typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>::policy_t::BLOCK_THREADS) + merge_sort_vsmem_helper_t::policy_t::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel( bool ping, KeyIteratorT keys_ping, @@ -276,9 +275,9 @@ __launch_bounds__( CompareOpT compare_op, OffsetT* merge_partitions, OffsetT target_merged_tiles_number, - cub::detail::vsmem_t vsmem) + vsmem_t vsmem) { - using MergeSortHelperT = cub::detail::merge_sort_vsmem_helper_t< + using MergeSortHelperT = merge_sort_vsmem_helper_t< typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, KeyInputIteratorT, ValueInputIteratorT, @@ -293,7 +292,7 @@ __launch_bounds__( using AgentMergeT = typename MergeSortHelperT::merge_agent_t; - using VSmemHelperT = cub::detail::vsmem_helper_impl; + using VSmemHelperT = vsmem_helper_impl; // Static shared memory allocation __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage; @@ -323,6 +322,8 @@ __launch_bounds__( VSmemHelperT::discard_temp_storage(temp_storage); } +} // namespace detail::merge_sort + /******************************************************************************* * Policy ******************************************************************************/ @@ -405,7 +406,7 @@ struct DispatchMergeSort { using MergePolicyT = typename ActivePolicyT::MergeSortPolicy; - using merge_sort_helper_t = cub::detail::merge_sort_vsmem_helper_t< + using merge_sort_helper_t = 
detail::merge_sort::merge_sort_vsmem_helper_t< MergePolicyT, KeyInputIteratorT, ValueInputIteratorT, @@ -416,8 +417,8 @@ struct DispatchMergeSort KeyT, ValueT>; - using BlockSortVSmemHelperT = cub::detail::vsmem_helper_impl; - using MergeAgentVSmemHelperT = cub::detail::vsmem_helper_impl; + using BlockSortVSmemHelperT = detail::vsmem_helper_impl; + using MergeAgentVSmemHelperT = detail::vsmem_helper_impl; cudaError error = cudaSuccess; @@ -445,7 +446,7 @@ struct DispatchMergeSort */ const std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; const std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; - const std::size_t virtual_shared_memory_size = (cub::max)(block_sort_smem_size, merge_smem_size); + const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size); void* allocations[4] = {nullptr, nullptr, nullptr, nullptr}; std::size_t allocation_sizes[4] = { @@ -486,7 +487,7 @@ struct DispatchMergeSort THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( static_cast(num_tiles), merge_sort_helper_t::policy_t::BLOCK_THREADS, 0, stream, true) .doit( - DeviceMergeSortBlockSortKernel< + detail::merge_sort::DeviceMergeSortBlockSortKernel< typename PolicyHub::MaxPolicy, KeyInputIteratorT, ValueInputIteratorT, @@ -544,7 +545,7 @@ struct DispatchMergeSort // Partition THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( partition_grid_size, threads_per_partition_block, 0, stream, true) - .doit(DeviceMergeSortPartitionKernel, + .doit(detail::merge_sort::DeviceMergeSortPartitionKernel, ping, d_output_keys, keys_buffer, @@ -572,15 +573,16 @@ struct DispatchMergeSort THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( static_cast(num_tiles), static_cast(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true) .doit( - DeviceMergeSortMergeKernel, + detail::merge_sort::DeviceMergeSortMergeKernel< + typename PolicyHub::MaxPolicy, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + KeyT, + ValueT>, ping, d_output_keys, d_output_items, diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index 0d4d9bf1ea9..18bbd99d00d 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -77,6 +77,9 @@ CUB_NAMESPACE_BEGIN * Kernel entry points *****************************************************************************/ +namespace detail::radix_sort +{ + /** * @brief Upsweep digit-counting kernel entry point (multi-block). * Computes privatized digit histograms, one per block. @@ -149,7 +152,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp }; // Parameterize AgentRadixSortUpsweep type for the current configuration - using AgentRadixSortUpsweepT = AgentRadixSortUpsweep; + using AgentRadixSortUpsweepT = + detail::radix_sort::AgentRadixSortUpsweep; // Shared memory storage __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage; @@ -161,7 +165,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? 
int(ChainedPolicyT::ActivePolicy::AltUp upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); - CTA_SYNC(); + __syncthreads(); // Write out digit counts (striped) upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); @@ -190,13 +194,13 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), { // Parameterize the AgentScan type for the current configuration using AgentScanT = - AgentScan, - OffsetT, - OffsetT, - OffsetT>; + detail::scan::AgentScan, + OffsetT, + OffsetT, + OffsetT>; // Shared memory storage __shared__ typename AgentScanT::TempStorage temp_storage; @@ -308,7 +312,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo }; // Parameterize AgentRadixSortDownsweep type for the current configuration - using AgentRadixSortDownsweepT = + using AgentRadixSortDownsweepT = detail::radix_sort:: AgentRadixSortDownsweep; // Shared memory storage @@ -432,7 +436,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE // Load keys BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); - CTA_SYNC(); + __syncthreads(); // Load values if (!KEYS_ONLY) @@ -443,7 +447,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); - CTA_SYNC(); + __syncthreads(); } // Sort tile @@ -570,13 +574,14 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen }; // Upsweep type - using BlockUpsweepT = AgentRadixSortUpsweep; + using BlockUpsweepT = detail::radix_sort::AgentRadixSortUpsweep; // Digit-scan type using DigitScanT = BlockScan; // Downsweep type - using BlockDownsweepT = AgentRadixSortDownsweep; + using BlockDownsweepT = + detail::radix_sort::AgentRadixSortDownsweep; enum { @@ -616,13 +621,13 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits, decomposer); upsweep.ProcessRegion(segment_begin, segment_end); - CTA_SYNC(); + __syncthreads(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); - CTA_SYNC(); + __syncthreads(); if (IS_DESCENDING) { @@ -638,7 +643,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -677,7 +682,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) @@ -691,7 +696,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? 
ChainedPolicyT::ActivePolicy::AltSegmen } } - CTA_SYNC(); + __syncthreads(); // Downsweep BlockDownsweepT downsweep( @@ -729,7 +734,8 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit, DecomposerT decomposer = {}) { using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy; - using AgentT = AgentRadixSortHistogram; + using AgentT = + detail::radix_sort::AgentRadixSortHistogram; __shared__ typename AgentT::TempStorage temp_storage; AgentT agent(temp_storage, d_bins_out, d_keys_in, num_items, start_bit, end_bit, decomposer); agent.Process(); @@ -759,7 +765,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy DecomposerT decomposer = {}) { using OnesweepPolicyT = typename ChainedPolicyT::ActivePolicy::OnesweepPolicy; - using AgentT = + using AgentT = detail::radix_sort:: AgentRadixSortOnesweep; __shared__ typename AgentT::TempStorage s; @@ -824,6 +830,8 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_b } } +} // namespace detail::radix_sort + /****************************************************************************** * Single-problem dispatch ******************************************************************************/ @@ -965,7 +973,7 @@ struct DispatchRadixSort } // Log single_tile_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit " "%d, bit_grain %d\n", 1, @@ -1036,7 +1044,7 @@ struct DispatchRadixSort int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); // Log upsweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, " "bit_grain %d\n", pass_config.even_share.grid_size, @@ -1078,7 +1086,7 @@ struct DispatchRadixSort } // Log scan_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", 1, pass_config.scan_config.block_threads, @@ -1105,7 +1113,7 @@ struct DispatchRadixSort } // Log downsweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, @@ -1295,7 +1303,8 @@ struct DispatchRadixSort constexpr int HISTO_BLOCK_THREADS = ActivePolicyT::HistogramPolicy::BLOCK_THREADS; int histo_blocks_per_sm = 1; - auto histogram_kernel = DeviceRadixSortHistogramKernel; + auto histogram_kernel = + detail::radix_sort::DeviceRadixSortHistogramKernel; error = CubDebug( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0)); @@ -1305,7 +1314,7 @@ struct DispatchRadixSort } // log histogram_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, " "%d SM occupancy, bit_grain %d\n", histo_blocks_per_sm * num_sms, @@ -1335,7 +1344,7 @@ struct DispatchRadixSort constexpr int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; // log exclusive_sum_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG 
_CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n", num_passes, SCAN_BLOCK_THREADS, @@ -1344,7 +1353,7 @@ struct DispatchRadixSort #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_passes, SCAN_BLOCK_THREADS, 0, stream) - .doit(DeviceRadixSortExclusiveSumKernel, d_bins); + .doit(detail::radix_sort::DeviceRadixSortExclusiveSumKernel, d_bins); error = CubDebug(error); if (cudaSuccess != error) { @@ -1383,7 +1392,7 @@ struct DispatchRadixSort } // log onesweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, " "current bit %d, bit_grain %d, portion %d/%d\n", num_blocks, @@ -1396,7 +1405,7 @@ struct DispatchRadixSort static_cast(num_portions)); #endif - auto onesweep_kernel = DeviceRadixSortOnesweepKernel< + auto onesweep_kernel = detail::radix_sort::DeviceRadixSortOnesweepKernel< max_policy_t, IS_DESCENDING, KeyT, @@ -1647,11 +1656,13 @@ struct DispatchRadixSort { // Invoke upsweep-downsweep return InvokePasses( - DeviceRadixSortUpsweepKernel, - DeviceRadixSortUpsweepKernel, - RadixSortScanBinsKernel, - DeviceRadixSortDownsweepKernel, - DeviceRadixSortDownsweepKernel); + detail::radix_sort::DeviceRadixSortUpsweepKernel, + detail::radix_sort::DeviceRadixSortUpsweepKernel, + detail::radix_sort::RadixSortScanBinsKernel, + detail::radix_sort:: + DeviceRadixSortDownsweepKernel, + detail::radix_sort:: + DeviceRadixSortDownsweepKernel); } template @@ -1672,7 +1683,7 @@ struct DispatchRadixSort } // Copy keys -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long) num_items, (long long) stream); #endif cudaError_t error = cudaSuccess; @@ -1694,7 +1705,7 @@ struct DispatchRadixSort // Copy values if necessary if (!KEYS_ONLY) { -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking async copy of %lld values on stream %lld\n", (long long) num_items, (long long) stream); #endif error = CubDebug(cudaMemcpyAsync( @@ -1751,7 +1762,8 @@ struct DispatchRadixSort { // Small, single tile size return InvokeSingleTile( - DeviceRadixSortSingleTileKernel); + detail::radix_sort:: + DeviceRadixSortSingleTileKernel); } else { @@ -2001,7 +2013,7 @@ struct DispatchSegmentedRadixSort int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); // Log kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), " "%lld items per thread, %lld SM occupancy, " "current bit %d, bit_grain %d\n", @@ -2223,7 +2235,7 @@ struct DispatchSegmentedRadixSort // Force kernel code-generation in all compiler passes return InvokePasses( - DeviceSegmentedRadixSortKernel< + detail::radix_sort::DeviceSegmentedRadixSortKernel< max_policy_t, false, IS_DESCENDING, @@ -2233,7 +2245,7 @@ struct DispatchSegmentedRadixSort EndOffsetIteratorT, OffsetT, DecomposerT>, - DeviceSegmentedRadixSortKernel< + detail::radix_sort::DeviceSegmentedRadixSortKernel< max_policy_t, true, IS_DESCENDING, diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 0cca1e1a982..fee10767875 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -66,6 +66,9 @@ _CCCL_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_BEGIN +namespace detail::reduce +{ + /// Normalize input iterator to segment offset template _CCCL_DEVICE 
_CCCL_FORCEINLINE void NormalizeReductionOutput(T& /*val*/, OffsetT /*base_offset*/, IteratorT /*itr*/) @@ -187,7 +190,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) if (threadIdx.x == 0) { - detail::reduce::finalize_and_store_aggregate(d_out + blockIdx.x, reduction_op, init, block_aggregate); + finalize_and_store_aggregate(d_out + blockIdx.x, reduction_op, init, block_aggregate); } } @@ -230,6 +233,7 @@ struct DeviceReduceKernelSource return sizeof(AccumT); } }; +} // namespace detail::reduce /****************************************************************************** * Single-problem dispatch @@ -263,7 +267,7 @@ template , InitT>, typename PolicyHub = detail::reduce::policy_hub, typename TransformOpT = ::cuda::std::__identity, - typename KernelSource = DeviceReduceKernelSource< + typename KernelSource = detail::reduce::DeviceReduceKernelSource< typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, @@ -378,13 +382,13 @@ struct DispatchReduce } // Log single_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", policy.SingleTile().BlockThreads(), (long long) stream, policy.SingleTile().ItemsPerThread()); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke single_reduce_sweep_kernel launcher_factory(1, policy.SingleTile().BlockThreads(), 0, stream) @@ -490,7 +494,7 @@ struct DispatchReduce int reduce_grid_size = even_share.grid_size; // Log device_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceKernel<<<%lu, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", (unsigned long) reduce_grid_size, @@ -498,7 +502,7 @@ struct DispatchReduce (long long) stream, active_policy.Reduce().ItemsPerThread(), reduce_config.sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke DeviceReduceKernel launcher_factory(reduce_grid_size, active_policy.Reduce().BlockThreads(), 0, stream) @@ -519,13 +523,13 @@ struct DispatchReduce } // Log single_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", active_policy.SingleTile().BlockThreads(), (long long) stream, active_policy.SingleTile().ItemsPerThread()); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke DeviceReduceSingleTileKernel launcher_factory(1, active_policy.SingleTile().BlockThreads(), 0, stream) @@ -698,7 +702,7 @@ template < typename AccumT = ::cuda::std:: __accumulator_t>, InitT>, typename PolicyHub = detail::reduce::policy_hub, - typename KernelSource = DeviceReduceKernelSource< + typename KernelSource = detail::reduce::DeviceReduceKernelSource< typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, @@ -881,7 +885,7 @@ struct DispatchSegmentedReduce } // Log device_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread, %d SM occupancy\n", num_segments, @@ -889,7 +893,7 @@ struct DispatchSegmentedReduce (long long) stream, ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, segmented_reduce_config.sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke DeviceReduceKernel 
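// (For reference: the triple_chevron launcher used throughout these dispatch layers is a thin
//  wrapper over a <<<grid, block, smem, stream>>> launch that can also be used from device code
//  when CDP is enabled. A minimal sketch of the calling pattern, with illustrative argument
//  names only:
//
//    THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
//        grid_size, block_threads, /* smem */ 0, stream)
//      .doit(SomeKernel<MaxPolicyT /*, ... */>, kernel_arg0, kernel_arg1);
//
//  The kernel entry point is passed as an ordinary function pointer, followed by its arguments.)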
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -920,7 +924,7 @@ struct DispatchSegmentedReduce { // Force kernel code-generation in all compiler passes return InvokePasses( - DeviceSegmentedReduceKernel< + detail::reduce::DeviceSegmentedReduceKernel< typename PolicyHub::MaxPolicy, InputIteratorT, OutputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 804371588f3..d13a9c10b64 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -64,6 +64,9 @@ CUB_NAMESPACE_BEGIN * Kernel entry points *****************************************************************************/ +namespace detail::reduce +{ + /** * @brief Multi-block reduce-by-key sweep kernel entry point * @@ -176,6 +179,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH .ConsumeRange(num_items, tile_state, start_tile); } +} // namespace detail::reduce + /****************************************************************************** * Dispatch ******************************************************************************/ @@ -341,9 +346,9 @@ struct DispatchReduceByKey // Log init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -391,7 +396,7 @@ struct DispatchReduceByKey for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log reduce_by_key_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d " "items per thread, %d SM occupancy\n", start_tile, @@ -400,7 +405,7 @@ struct DispatchReduceByKey (long long) stream, items_per_thread, reduce_by_key_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke reduce_by_key_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -439,8 +444,8 @@ struct DispatchReduceByKey CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { return Invoke( - DeviceCompactInitKernel, - DeviceReduceByKeyKernel< + detail::scan::DeviceCompactInitKernel, + detail::reduce::DeviceReduceByKeyKernel< typename PolicyHub::MaxPolicy, KeysInputIteratorT, UniqueOutputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index b1542462a58..697edc8f4e6 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -65,6 +65,9 @@ CUB_NAMESPACE_BEGIN * Kernel entry points *****************************************************************************/ +namespace detail::rle +{ + /** * Select kernel entry point (multi-block) * @@ -152,6 +155,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items) .ConsumeRange(num_tiles, tile_status, d_num_runs_out); } +} // namespace detail::rle /****************************************************************************** * Dispatch @@ 
-349,12 +353,12 @@ struct DeviceRleDispatch // Log device_scan_init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -405,7 +409,7 @@ struct DeviceRleDispatch scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log device_rle_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per " "thread, %d SM occupancy\n", scan_grid_size.x, @@ -415,7 +419,7 @@ struct DeviceRleDispatch (long long) stream, items_per_thread, device_rle_kernel_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke device_rle_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) @@ -451,15 +455,16 @@ struct DeviceRleDispatch CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { return Invoke( - DeviceCompactInitKernel, - DeviceRleSweepKernel); + detail::scan::DeviceCompactInitKernel, + detail::rle::DeviceRleSweepKernel< + typename PolicyHub::MaxPolicy, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + NumRunsOutputIteratorT, + ScanTileStateT, + EqualityOpT, + OffsetT>); } /** diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 4db31cf6989..2fdfe29ab7f 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -260,9 +260,9 @@ struct DispatchScan // Log init_kernel configuration int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -305,7 +305,7 @@ struct DispatchScan for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log scan_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", start_tile, @@ -314,7 +314,7 @@ struct DispatchScan (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) @@ -345,16 +345,17 @@ struct DispatchScan using ScanTileStateT = typename cub::ScanTileState; // Ensure kernels are instantiated. 
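// (As a side note: "ensure kernels are instantiated" works by handing the kernel entry points to
//  Invoke() as function pointers, so every compilation pass emits their code even though the
//  launch itself only happens at run time. A minimal sketch of the shape; the parameter names
//  are illustrative, not the actual signature:
//
//    template <typename InitKernelT, typename ScanKernelT>
//    CUB_RUNTIME_FUNCTION cudaError_t Invoke(InitKernelT init_kernel, ScanKernelT scan_kernel);
//
//  The only functional change in this hunk is that the entry points are now spelled with their
//  detail::scan:: qualification.)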
return Invoke( - DeviceScanInitKernel, - DeviceScanKernel); + detail::scan::DeviceScanInitKernel, + detail::scan::DeviceScanKernel< + typename PolicyHub::MaxPolicy, + InputIteratorT, + OutputIteratorT, + ScanTileStateT, + ScanOpT, + InitValueT, + OffsetT, + AccumT, + ForceInclusive>); } /** diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index c88656dff48..9478543ab3b 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -63,6 +63,9 @@ CUB_NAMESPACE_BEGIN * Kernel entry points *****************************************************************************/ +namespace detail::scan_by_key +{ + /** * @brief Scan by key kernel entry point (multi-block) * @@ -150,16 +153,16 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THRE using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT; // Thread block type for scanning input tiles - using AgentScanByKeyT = - AgentScanByKey; + using AgentScanByKeyT = detail::scan_by_key::AgentScanByKey< + ScanByKeyPolicyT, + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOp, + ScanOpT, + InitValueT, + OffsetT, + AccumT>; // Shared memory for AgentScanByKey __shared__ typename AgentScanByKeyT::TempStorage temp_storage; @@ -188,6 +191,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyInitKernel( d_keys_prev_in[tid] = d_keys_in[tile_base - 1]; } } +} // namespace detail::scan_by_key /****************************************************************************** * Dispatch @@ -406,9 +410,9 @@ struct DispatchScanByKey // Log init_kernel configuration int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -441,7 +445,7 @@ struct DispatchScanByKey for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log scan_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread\n", start_tile, @@ -449,7 +453,7 @@ struct DispatchScanByKey Policy::BLOCK_THREADS, (long long) stream, Policy::ITEMS_PER_THREAD); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) @@ -489,17 +493,18 @@ struct DispatchScanByKey { // Ensure kernels are instantiated. 
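// (For orientation: DeviceScanByKeyInitKernel, shown further up, initializes the tile state and
//  additionally caches the key just before each tile's first item into d_keys_prev_in
//  (d_keys_prev_in[tid] = d_keys_in[tile_base - 1]). A small index sketch, assuming each tile
//  spans ITEMS_PER_TILE consecutive items; the names below are illustrative:
//
//    tile_base           = tid * ITEMS_PER_TILE;       // first item owned by tile `tid`
//    d_keys_prev_in[tid] = d_keys_in[tile_base - 1];   // last key of the preceding tile
//
//  With that value on hand, each tile of the scan-by-key sweep can decide whether its first item
//  continues the previous segment without re-reading its neighbor's input.)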
return Invoke( - DeviceScanByKeyInitKernel, - DeviceScanByKeyKernel); + detail::scan_by_key::DeviceScanByKeyInitKernel, + detail::scan_by_key::DeviceScanByKeyKernel< + typename PolicyHub::MaxPolicy, + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + ScanByKeyTileStateT, + EqualityOp, + ScanOpT, + InitValueT, + OffsetT, + AccumT>); } /** diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 9d011d414ba..5690371d3fb 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -54,10 +54,14 @@ #include #include +#include #include #include #include +#include +#include +#include #include #include @@ -66,6 +70,44 @@ CUB_NAMESPACE_BEGIN +namespace detail::segmented_sort +{ +// Type used to index within segments within a single invocation +using local_segment_index_t = ::cuda::std::uint32_t; +// Type used for total number of segments and to index within segments globally +using global_segment_offset_t = ::cuda::std::int64_t; + +template +class OffsetIteratorT : public THRUST_NS_QUALIFIER::iterator_adaptor, Iterator> +{ +public: + using super_t = THRUST_NS_QUALIFIER::iterator_adaptor, Iterator>; + + OffsetIteratorT() = default; + + _CCCL_HOST_DEVICE OffsetIteratorT(const Iterator& it, OffsetItT offset_it) + : super_t(it) + , offset_it(offset_it) + {} + + // befriend thrust::iterator_core_access to allow it access to the private interface below + friend class THRUST_NS_QUALIFIER::iterator_core_access; + +private: + OffsetItT offset_it; + + _CCCL_HOST_DEVICE typename super_t::reference dereference() const + { + return *(this->base() + (*offset_it)); + } +}; + +template +_CCCL_HOST_DEVICE OffsetIteratorT make_offset_iterator(const Iterator& it, OffsetItT offset_it) +{ + return OffsetIteratorT{it, offset_it}; +} + /** * @brief Fallback kernel, in case there's not enough segments to * take advantage of partitioning. 
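// (To illustrate: OffsetIteratorT above merely re-bases another iterator by a runtime shift;
//  element i of the adapted iterator reads base[i + *offset_it]. A minimal usage sketch, where
//  d_begin_offsets and seg_base are illustrative names:
//
//    #include <thrust/iterator/constant_iterator.h>
//    auto shifted = detail::segmented_sort::make_offset_iterator(
//      d_begin_offsets, THRUST_NS_QUALIFIER::constant_iterator<global_segment_offset_t>{seg_base});
//    // shifted[i] now reads d_begin_offsets[seg_base + i]
//
//  The streaming dispatch below composes exactly this with a constant_iterator holding the first
//  global segment of the current invocation, so segments can keep being indexed with 32-bit
//  local indices while the total segment count is 64-bit.)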
@@ -117,10 +159,10 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortFallbackKernel( const KeyT* d_keys_in_orig, KeyT* d_keys_out_orig, - cub::detail::device_double_buffer d_keys_double_buffer, + device_double_buffer d_keys_double_buffer, const ValueT* d_values_in_orig, ValueT* d_values_out_orig, - cub::detail::device_double_buffer d_values_double_buffer, + device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { @@ -128,10 +170,10 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; using MediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT::MediumPolicyT; - const unsigned int segment_id = blockIdx.x; - OffsetT segment_begin = d_begin_offsets[segment_id]; - OffsetT segment_end = d_end_offsets[segment_id]; - OffsetT num_items = segment_end - segment_begin; + const auto segment_id = static_cast(blockIdx.x); + OffsetT segment_begin = d_begin_offsets[segment_id]; + OffsetT segment_end = d_end_offsets[segment_id]; + OffsetT num_items = segment_end - segment_begin; if (num_items <= 0) { @@ -139,11 +181,12 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD } using AgentSegmentedRadixSortT = - cub::AgentSegmentedRadixSort; + radix_sort::AgentSegmentedRadixSort; using WarpReduceT = cub::WarpReduce; - using AgentWarpMergeSortT = AgentSubWarpSort; + using AgentWarpMergeSortT = + sub_warp_merge_sort::AgentSubWarpSort; __shared__ union { @@ -187,14 +230,14 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { // Sort by a CTA with multiple reads from global memory int current_bit = begin_bit; - int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + int pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - d_keys_double_buffer = cub::detail::device_double_buffer( + d_keys_double_buffer = device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); if (!keys_only) { - d_values_double_buffer = cub::detail::device_double_buffer( + d_values_double_buffer = device_double_buffer( d_values_double_buffer.current() + segment_begin, d_values_double_buffer.alternate() + segment_begin); } @@ -210,9 +253,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD #pragma unroll 1 while (current_bit < end_bit) { - pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - CTA_SYNC(); + __syncthreads(); agent.ProcessIterative( current_bit, pass_bits, @@ -291,11 +334,11 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall( - unsigned int small_segments, - unsigned int medium_segments, - unsigned int medium_blocks, - const unsigned int* d_small_segments_indices, - const unsigned int* d_medium_segments_indices, + local_segment_index_t small_segments, + local_segment_index_t medium_segments, + local_segment_index_t medium_blocks, + const local_segment_index_t* d_small_segments_indices, + const local_segment_index_t* d_medium_segments_indices, const KeyT* d_keys_in, KeyT* 
d_keys_out, const ValueT* d_values_in, @@ -303,25 +346,30 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { - const unsigned int tid = threadIdx.x; - const unsigned int bid = blockIdx.x; + using local_segment_index_t = local_segment_index_t; + + const local_segment_index_t tid = threadIdx.x; + const local_segment_index_t bid = blockIdx.x; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; using MediumPolicyT = typename SmallAndMediumPolicyT::MediumPolicyT; using SmallPolicyT = typename SmallAndMediumPolicyT::SmallPolicyT; - constexpr int threads_per_medium_segment = MediumPolicyT::WARP_THREADS; - constexpr int threads_per_small_segment = SmallPolicyT::WARP_THREADS; + constexpr auto threads_per_medium_segment = static_cast(MediumPolicyT::WARP_THREADS); + constexpr auto threads_per_small_segment = static_cast(SmallPolicyT::WARP_THREADS); - using MediumAgentWarpMergeSortT = AgentSubWarpSort; + using MediumAgentWarpMergeSortT = + sub_warp_merge_sort::AgentSubWarpSort; - using SmallAgentWarpMergeSortT = AgentSubWarpSort; + using SmallAgentWarpMergeSortT = + sub_warp_merge_sort::AgentSubWarpSort; constexpr auto segments_per_medium_block = - static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); + static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); - constexpr auto segments_per_small_block = static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); + constexpr auto segments_per_small_block = + static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); __shared__ union { @@ -332,12 +380,12 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic if (bid < medium_blocks) { - const unsigned int sid_within_block = tid / threads_per_medium_segment; - const unsigned int medium_segment_id = bid * segments_per_medium_block + sid_within_block; + const local_segment_index_t sid_within_block = tid / threads_per_medium_segment; + const local_segment_index_t medium_segment_id = bid * segments_per_medium_block + sid_within_block; if (medium_segment_id < medium_segments) { - const unsigned int global_segment_id = d_medium_segments_indices[medium_segment_id]; + const local_segment_index_t global_segment_id = d_medium_segments_indices[medium_segment_id]; const OffsetT segment_begin = d_begin_offsets[global_segment_id]; const OffsetT segment_end = d_end_offsets[global_segment_id]; @@ -353,12 +401,12 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic } else { - const unsigned int sid_within_block = tid / threads_per_small_segment; - const unsigned int small_segment_id = (bid - medium_blocks) * segments_per_small_block + sid_within_block; + const local_segment_index_t sid_within_block = tid / threads_per_small_segment; + const local_segment_index_t small_segment_id = (bid - medium_blocks) * segments_per_small_block + sid_within_block; if (small_segment_id < small_segments) { - const unsigned int global_segment_id = d_small_segments_indices[small_segment_id]; + const local_segment_index_t global_segment_id = d_small_segments_indices[small_segment_id]; const OffsetT segment_begin = d_begin_offsets[global_segment_id]; const OffsetT segment_end = d_end_offsets[global_segment_id]; @@ -410,35 +458,36 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void 
DeviceSegmentedSortKernelLarge( - const unsigned int* d_segments_indices, + const local_segment_index_t* d_segments_indices, const KeyT* d_keys_in_orig, KeyT* d_keys_out_orig, - cub::detail::device_double_buffer d_keys_double_buffer, + device_double_buffer d_keys_double_buffer, const ValueT* d_values_in_orig, ValueT* d_values_out_orig, - cub::detail::device_double_buffer d_values_double_buffer, + device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { - using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; - using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; + using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; + using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; + using local_segment_index_t = local_segment_index_t; constexpr int small_tile_size = LargeSegmentPolicyT::BLOCK_THREADS * LargeSegmentPolicyT::ITEMS_PER_THREAD; using AgentSegmentedRadixSortT = - cub::AgentSegmentedRadixSort; + radix_sort::AgentSegmentedRadixSort; __shared__ typename AgentSegmentedRadixSortT::TempStorage storage; - const unsigned int bid = blockIdx.x; + const local_segment_index_t bid = blockIdx.x; constexpr int begin_bit = 0; constexpr int end_bit = sizeof(KeyT) * 8; - const unsigned int global_segment_id = d_segments_indices[bid]; - const OffsetT segment_begin = d_begin_offsets[global_segment_id]; - const OffsetT segment_end = d_end_offsets[global_segment_id]; - const OffsetT num_items = segment_end - segment_begin; + const local_segment_index_t global_segment_id = d_segments_indices[bid]; + const OffsetT segment_begin = d_begin_offsets[global_segment_id]; + const OffsetT segment_end = d_end_offsets[global_segment_id]; + const OffsetT num_items = segment_end - segment_begin; constexpr bool keys_only = std::is_same::value; AgentSegmentedRadixSortT agent(num_items, storage); @@ -461,14 +510,14 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { // Sort reading global memory multiple times int current_bit = begin_bit; - int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + int pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - d_keys_double_buffer = cub::detail::device_double_buffer( + d_keys_double_buffer = device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); if (!keys_only) { - d_values_double_buffer = cub::detail::device_double_buffer( + d_values_double_buffer = device_double_buffer( d_values_double_buffer.current() + segment_begin, d_values_double_buffer.alternate() + segment_begin); } @@ -484,9 +533,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD #pragma unroll 1 while (current_bit < end_bit) { - pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); - CTA_SYNC(); + __syncthreads(); agent.ProcessIterative( current_bit, pass_bits, @@ -522,33 +571,35 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont int num_segments, KeyT* d_current_keys, KeyT* d_final_keys, - detail::device_double_buffer d_keys_double_buffer, + device_double_buffer d_keys_double_buffer, ValueT* d_current_values, ValueT* d_final_values, - detail::device_double_buffer d_values_double_buffer, + device_double_buffer d_values_double_buffer, 
BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, - unsigned int* group_sizes, - unsigned int* large_and_medium_segments_indices, - unsigned int* small_segments_indices, + local_segment_index_t* group_sizes, + local_segment_index_t* large_and_medium_segments_indices, + local_segment_index_t* small_segments_indices, cudaStream_t stream) { + using local_segment_index_t = local_segment_index_t; + cudaError error = cudaSuccess; - const unsigned int large_segments = group_sizes[0]; + const local_segment_index_t large_segments = group_sizes[0]; if (large_segments > 0) { // One CTA per segment - const unsigned int blocks_in_grid = large_segments; + const local_segment_index_t blocks_in_grid = large_segments; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "DeviceSegmentedSortKernelLarge<<<%d, %d, 0, %lld>>>()\n", static_cast(blocks_in_grid), LargeSegmentPolicyT::BLOCK_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream) @@ -571,32 +622,34 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont } // Sync the stream if specified to flush runtime errors - error = CubDebug(detail::DebugSyncStream(stream)); + error = CubDebug(DebugSyncStream(stream)); if (cudaSuccess != error) { return error; } } - const unsigned int small_segments = group_sizes[1]; - const unsigned int medium_segments = static_cast(num_segments) - (large_segments + small_segments); + const local_segment_index_t small_segments = group_sizes[1]; + const local_segment_index_t medium_segments = + static_cast(num_segments) - (large_segments + small_segments); - const unsigned int small_blocks = ::cuda::ceil_div(small_segments, SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); + const local_segment_index_t small_blocks = + ::cuda::ceil_div(small_segments, SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); - const unsigned int medium_blocks = + const local_segment_index_t medium_blocks = ::cuda::ceil_div(medium_segments, SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); - const unsigned int small_and_medium_blocks_in_grid = small_blocks + medium_blocks; + const local_segment_index_t small_and_medium_blocks_in_grid = small_blocks + medium_blocks; if (small_and_medium_blocks_in_grid) { -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "DeviceSegmentedSortKernelSmall<<<%d, %d, 0, %lld>>>()\n", static_cast(small_and_medium_blocks_in_grid), SmallAndMediumPolicyT::BLOCK_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( small_and_medium_blocks_in_grid, SmallAndMediumPolicyT::BLOCK_THREADS, 0, stream) @@ -621,7 +674,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont } // Sync the stream if specified to flush runtime errors - error = CubDebug(detail::DebugSyncStream(stream)); + error = CubDebug(DebugSyncStream(stream)); if (cudaSuccess != error) { return error; @@ -646,18 +699,18 @@ template d_keys_double_buffer, + device_double_buffer d_keys_double_buffer, ValueT* d_current_values, ValueT* d_final_values, - detail::device_double_buffer d_values_double_buffer, + device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, - unsigned int* group_sizes, - unsigned int* 
large_and_medium_segments_indices, - unsigned int* small_segments_indices) + local_segment_index_t* group_sizes, + local_segment_index_t* large_and_medium_segments_indices, + local_segment_index_t* small_segments_indices) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; @@ -672,27 +725,30 @@ __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContin // // Due to (4, 5), we can't pass the user-provided stream in the continuation. // Due to (1, 2, 3) it's safe to pass the main stream. - cudaError_t error = DeviceSegmentedSortContinuation( - large_kernel, - small_kernel, - num_segments, - d_current_keys, - d_final_keys, - d_keys_double_buffer, - d_current_values, - d_final_values, - d_values_double_buffer, - d_begin_offsets, - d_end_offsets, - group_sizes, - large_and_medium_segments_indices, - small_segments_indices, - 0); // always launching on the main stream (see motivation above) + cudaError_t error = + detail::segmented_sort::DeviceSegmentedSortContinuation( + large_kernel, + small_kernel, + num_segments, + d_current_keys, + d_final_keys, + d_keys_double_buffer, + d_current_values, + d_final_values, + d_values_double_buffer, + d_begin_offsets, + d_end_offsets, + group_sizes, + large_and_medium_segments_indices, + small_segments_indices, + 0); // always launching on the main stream (see motivation above) error = CubDebug(error); } #endif // CUB_RDC_ENABLED +} // namespace detail::segmented_sort + template > struct DispatchSegmentedSort { + using local_segment_index_t = detail::segmented_sort::local_segment_index_t; + using global_segment_offset_t = detail::segmented_sort::global_segment_offset_t; + + using StreamingBeginOffsetIteratorT = + detail::segmented_sort::OffsetIteratorT>; + using StreamingEndOffsetIteratorT = + detail::segmented_sort::OffsetIteratorT>; + static constexpr int KEYS_ONLY = std::is_same::value; struct LargeSegmentsSelectorT @@ -709,6 +775,7 @@ struct DispatchSegmentedSort OffsetT value{}; BeginOffsetIteratorT d_offset_begin{}; EndOffsetIteratorT d_offset_end{}; + global_segment_offset_t base_segment_offset{}; _CCCL_HOST_DEVICE _CCCL_FORCEINLINE LargeSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end) @@ -717,9 +784,10 @@ struct DispatchSegmentedSort , d_offset_end(d_offset_end) {} - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(unsigned int segment_id) const + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const { - const OffsetT segment_size = d_offset_end[segment_id] - d_offset_begin[segment_id]; + const OffsetT segment_size = + d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id]; return segment_size > value; } }; @@ -729,6 +797,7 @@ struct DispatchSegmentedSort OffsetT value{}; BeginOffsetIteratorT d_offset_begin{}; EndOffsetIteratorT d_offset_end{}; + global_segment_offset_t base_segment_offset{}; _CCCL_HOST_DEVICE _CCCL_FORCEINLINE SmallSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end) @@ -737,9 +806,10 @@ struct DispatchSegmentedSort , d_offset_end(d_offset_end) {} - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(unsigned int segment_id) const + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const { - const OffsetT segment_size = d_offset_end[segment_id] - d_offset_begin[segment_id]; + const OffsetT segment_size = + 
d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id]; return segment_size < value; } }; @@ -770,10 +840,10 @@ struct DispatchSegmentedSort DoubleBuffer& d_values; /// Number of items to sort - OffsetT num_items; + ::cuda::std::int64_t num_items; /// The number of segments that comprise the sorting data - int num_segments; + global_segment_offset_t num_segments; /** * Random-access input iterator to the sequence of beginning offsets of length @@ -802,8 +872,8 @@ struct DispatchSegmentedSort std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - OffsetT num_items, - int num_segments, + ::cuda::std::int64_t num_items, + global_segment_offset_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, @@ -871,9 +941,10 @@ struct DispatchSegmentedSort } } - auto large_and_medium_segments_indices = large_and_medium_partitioning_slot->create_alias(); - auto small_segments_indices = small_partitioning_slot->create_alias(); - auto group_sizes = group_sizes_slot->create_alias(); + auto large_and_medium_segments_indices = + large_and_medium_partitioning_slot->create_alias(); + auto small_segments_indices = small_partitioning_slot->create_alias(); + auto group_sizes = group_sizes_slot->create_alias(); std::size_t three_way_partition_temp_storage_bytes{}; @@ -887,8 +958,13 @@ struct DispatchSegmentedSort if (partition_segments) { - large_and_medium_segments_indices.grow(num_segments); - small_segments_indices.grow(num_segments); + constexpr auto num_segments_per_invocation_limit = + static_cast(::cuda::std::numeric_limits::max()); + auto const max_num_segments_per_invocation = static_cast( + (::cuda::std::min)(static_cast(num_segments), num_segments_per_invocation_limit)); + + large_and_medium_segments_indices.grow(max_num_segments_per_invocation); + small_segments_indices.grow(max_num_segments_per_invocation); group_sizes.grow(num_selected_groups); auto medium_indices_iterator = @@ -897,12 +973,12 @@ struct DispatchSegmentedSort cub::DevicePartition::IfNoNVTX( nullptr, three_way_partition_temp_storage_bytes, - THRUST_NS_QUALIFIER::counting_iterator(0), + THRUST_NS_QUALIFIER::counting_iterator(0), large_and_medium_segments_indices.get(), small_segments_indices.get(), medium_indices_iterator, group_sizes.get(), - num_segments, + max_num_segments_per_invocation, large_segments_selector, small_segments_selector, stream); @@ -1002,20 +1078,22 @@ struct DispatchSegmentedSort // Partition input segments into size groups and assign specialized // kernels for each of them. error = SortWithPartitioning( - DeviceSegmentedSortKernelLarge, - DeviceSegmentedSortKernelSmall, + detail::segmented_sort::DeviceSegmentedSortKernelLarge< + IS_DESCENDING, + MaxPolicyT, + KeyT, + ValueT, + StreamingBeginOffsetIteratorT, + StreamingEndOffsetIteratorT, + OffsetT>, + detail::segmented_sort::DeviceSegmentedSortKernelSmall< + IS_DESCENDING, + MaxPolicyT, + KeyT, + ValueT, + StreamingBeginOffsetIteratorT, + StreamingEndOffsetIteratorT, + OffsetT>, three_way_partition_temp_storage_bytes, d_keys_double_buffer, d_values_double_buffer, @@ -1032,13 +1110,14 @@ struct DispatchSegmentedSort // on extra partitioning steps. 
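// (For reference: the SortWithPartitioning path sizes its index arrays for at most
//  numeric_limits<local_segment_index_t>::max() segments per invocation and, as the loop further
//  below shows, processes larger segment counts in several invocations:
//
//    num_invocations = ::cuda::ceil_div(num_segments, global_segment_offset_t{0xFFFFFFFF});
//    // e.g. num_segments == 10'000'000'000  ->  3 invocations, the last covering
//    //      10'000'000'000 - 2 * 4'294'967'295 = 1'410'065'410 segments
//
//  The figures are an illustrative worked example, not values taken from the code.)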
error = SortWithoutPartitioning( - DeviceSegmentedSortFallbackKernel, + detail::segmented_sort::DeviceSegmentedSortFallbackKernel< + IS_DESCENDING, + MaxPolicyT, + KeyT, + ValueT, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>, d_keys_double_buffer, d_values_double_buffer); } @@ -1056,8 +1135,8 @@ struct DispatchSegmentedSort std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, - OffsetT num_items, - int num_segments, + ::cuda::std::int64_t num_items, + global_segment_offset_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, @@ -1136,35 +1215,56 @@ private: LargeSegmentsSelectorT& large_segments_selector, SmallSegmentsSelectorT& small_segments_selector, cub::detail::temporary_storage::alias& device_partition_temp_storage, - cub::detail::temporary_storage::alias& large_and_medium_segments_indices, - cub::detail::temporary_storage::alias& small_segments_indices, - cub::detail::temporary_storage::alias& group_sizes) + cub::detail::temporary_storage::alias& large_and_medium_segments_indices, + cub::detail::temporary_storage::alias& small_segments_indices, + cub::detail::temporary_storage::alias& group_sizes) { cudaError_t error = cudaSuccess; - auto medium_indices_iterator = - THRUST_NS_QUALIFIER::make_reverse_iterator(large_and_medium_segments_indices.get() + num_segments); - - error = CubDebug(cub::DevicePartition::IfNoNVTX( - device_partition_temp_storage.get(), - three_way_partition_temp_storage_bytes, - THRUST_NS_QUALIFIER::counting_iterator(0), - large_and_medium_segments_indices.get(), - small_segments_indices.get(), - medium_indices_iterator, - group_sizes.get(), - num_segments, - large_segments_selector, - small_segments_selector, - stream)); - if (cudaSuccess != error) + constexpr global_segment_offset_t num_segments_per_invocation_limit = + static_cast(::cuda::std::numeric_limits::max()); + + // We repeatedly invoke the partitioning and sorting kernels until all segments are processed. + const global_segment_offset_t num_invocations = + ::cuda::ceil_div(static_cast(num_segments), num_segments_per_invocation_limit); + for (global_segment_offset_t invocation_index = 0; invocation_index < num_invocations; invocation_index++) { - return error; - } + const global_segment_offset_t current_seg_offset = invocation_index * num_segments_per_invocation_limit; + const local_segment_index_t current_num_segments = + (invocation_index == (num_invocations - 1)) + ? 
static_cast(num_segments - current_seg_offset) + : num_segments_per_invocation_limit; + + large_segments_selector.base_segment_offset = current_seg_offset; + small_segments_selector.base_segment_offset = current_seg_offset; + auto current_begin_offset = detail::segmented_sort::make_offset_iterator( + d_begin_offsets, THRUST_NS_QUALIFIER::constant_iterator{current_seg_offset}); + auto current_end_offset = detail::segmented_sort::make_offset_iterator( + d_end_offsets, THRUST_NS_QUALIFIER::constant_iterator{current_seg_offset}); + + auto medium_indices_iterator = + THRUST_NS_QUALIFIER::make_reverse_iterator(large_and_medium_segments_indices.get() + current_num_segments); + + error = CubDebug(cub::DevicePartition::IfNoNVTX( + device_partition_temp_storage.get(), + three_way_partition_temp_storage_bytes, + THRUST_NS_QUALIFIER::counting_iterator(0), + large_and_medium_segments_indices.get(), + small_segments_indices.get(), + medium_indices_iterator, + group_sizes.get(), + current_num_segments, + large_segments_selector, + small_segments_selector, + stream)); + if (cudaSuccess != error) + { + return error; + } - // The device path is only used (and only compiles) when CDP is enabled. - // It's defined in a macro since we can't put `#ifdef`s inside of - // `NV_IF_TARGET`. + // The device path is only used (and only compiles) when CDP is enabled. + // It's defined in a macro since we can't put `#ifdef`s inside of + // `NV_IF_TARGET`. #ifndef CUB_RDC_ENABLED # define CUB_TEMP_DEVICE_CODE @@ -1175,25 +1275,25 @@ private: error = \ THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \ .doit( \ - DeviceSegmentedSortContinuationKernel< \ + detail::segmented_sort::DeviceSegmentedSortContinuationKernel< \ typename PolicyHub::MaxPolicy, \ LargeKernelT, \ SmallKernelT, \ KeyT, \ ValueT, \ - BeginOffsetIteratorT, \ - EndOffsetIteratorT>, \ + StreamingBeginOffsetIteratorT, \ + StreamingEndOffsetIteratorT>, \ large_kernel, \ small_kernel, \ - num_segments, \ + current_num_segments, \ d_keys.Current(), \ GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), \ d_keys_double_buffer, \ d_values.Current(), \ GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), \ d_values_double_buffer, \ - d_begin_offsets, \ - d_end_offsets, \ + current_begin_offset, \ + current_end_offset, \ group_sizes.get(), \ large_and_medium_segments_indices.get(), \ small_segments_indices.get()); \ @@ -1212,16 +1312,16 @@ private: #endif // CUB_RDC_ENABLED - // Clang format mangles some of this NV_IF_TARGET block - // clang-format off + // Clang format mangles some of this NV_IF_TARGET block + // clang-format off NV_IF_TARGET( NV_IS_HOST, ( - unsigned int h_group_sizes[num_selected_groups]; + local_segment_index_t h_group_sizes[num_selected_groups]; error = CubDebug(cudaMemcpyAsync(h_group_sizes, group_sizes.get(), num_selected_groups * - sizeof(unsigned int), + sizeof(local_segment_index_t), cudaMemcpyDeviceToHost, stream)); @@ -1236,27 +1336,27 @@ private: return error; } - error = DeviceSegmentedSortContinuation( large_kernel, small_kernel, - num_segments, + current_num_segments, d_keys.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), d_keys_double_buffer, d_values.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), d_values_double_buffer, - d_begin_offsets, - d_end_offsets, + current_begin_offset, + current_end_offset, h_group_sizes, large_and_medium_segments_indices.get(), small_segments_indices.get(), stream);), // NV_IS_DEVICE: (CUB_TEMP_DEVICE_CODE)); - // clang-format 
on - + // clang-format on + } #undef CUB_TEMP_DEVICE_CODE return error; @@ -1270,11 +1370,11 @@ private: { cudaError_t error = cudaSuccess; - const auto blocks_in_grid = static_cast(num_segments); + const auto blocks_in_grid = static_cast(num_segments); constexpr auto threads_in_block = static_cast(LargeSegmentPolicyT::BLOCK_THREADS); // Log kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceSegmentedSortFallbackKernel<<<%d, %d, " "0, %lld>>>(), %d items per thread, bit_grain %d\n", blocks_in_grid, @@ -1282,7 +1382,7 @@ private: (long long) stream, LargeSegmentPolicyT::ITEMS_PER_THREAD, LargeSegmentPolicyT::RADIX_BITS); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke fallback kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index c41dfb389eb..5c370c8b0c9 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -62,10 +62,7 @@ CUB_NAMESPACE_BEGIN -namespace detail -{ - -namespace select +namespace detail::select { // Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition using per_partition_offset_t = ::cuda::std::int32_t; @@ -231,8 +228,6 @@ struct agent_select_if_wrapper_t MayAlias>::AgentSelectIf; }; }; -} // namespace select -} // namespace detail /****************************************************************************** * Kernel entry points @@ -329,9 +324,9 @@ template __launch_bounds__(int( - cub::detail::vsmem_helper_default_fallback_policy_t< + vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, - detail::select::agent_select_if_wrapper_t::template agent_t, + agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -350,11 +345,11 @@ __launch_bounds__(int( OffsetT num_items, int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context, - cub::detail::vsmem_t vsmem) + vsmem_t vsmem) { - using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< + using VsmemHelperT = vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, - detail::select::agent_select_if_wrapper_t::template agent_t, + agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -381,6 +376,7 @@ __launch_bounds__(int( // If applicable, hints to discard modified cache lines for vsmem VsmemHelperT::discard_temp_storage(temp_storage); } +} // namespace detail::select /****************************************************************************** * Dispatch @@ -660,7 +656,7 @@ struct DispatchSelectIf // Log scan_init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(current_num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, @@ -693,7 +689,7 @@ struct DispatchSelectIf } // Log select_if_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; @@ -756,8 +752,8 @@ struct DispatchSelectIf CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { return Invoke( - DeviceCompactInitKernel, - 
DeviceSelectSweepKernel< + detail::scan::DeviceCompactInitKernel, + detail::select::DeviceSelectSweepKernel< typename PolicyHub::MaxPolicy, InputIteratorT, FlagsInputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index 6dc4f44aeca..24ef2845dee 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -83,8 +83,11 @@ CUB_NAMESPACE_BEGIN * @param[in] spmv_params * SpMV input parameter bundle */ +_CCCL_SUPPRESS_DEPRECATED_PUSH template -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams spmv_params) +CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams spmv_params) // + _CCCL_SUPPRESS_DEPRECATED_POP { using VectorValueIteratorT = CacheModifiedInputIterator; @@ -132,8 +135,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams -CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceSpmvSearchKernel(int num_merge_tiles, CoordinateT* d_tile_coordinates, SpmvParamsT spmv_params) +CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel( + int num_merge_tiles, CoordinateT* d_tile_coordinates, SpmvParamsT spmv_params) { /// Constants enum @@ -217,6 +221,7 @@ template +CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") __launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel( SpmvParams spmv_params, CoordinateT* d_tile_coordinates, @@ -226,7 +231,9 @@ __launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES int num_segment_fixup_tiles) { // Spmv agent type specialization + _CCCL_SUPPRESS_DEPRECATED_PUSH using AgentSpmvT = AgentSpmv; + _CCCL_SUPPRESS_DEPRECATED_POP // Shared memory for AgentSpmv __shared__ typename AgentSpmvT::TempStorage temp_storage; @@ -248,6 +255,7 @@ __launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES * Whether the input parameter Beta is 0 */ template +CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) { const int row = static_cast(threadIdx.x + blockIdx.x * blockDim.x); @@ -298,18 +306,21 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvEmptyMatrixKernel(SpmvParams +CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") __launch_bounds__(int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel( PairsInputIteratorT d_pairs_in, AggregatesOutputIteratorT d_aggregates_out, OffsetT num_items, int num_tiles, - ScanTileStateT tile_state) + ScanTileStateT tile_state) // + _CCCL_SUPPRESS_DEPRECATED_POP { // Thread block type for reducing tiles of value segments using AgentSegmentFixupT = @@ -342,7 +353,7 @@ __launch_bounds__(int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) * Signed integer type for global offsets */ template -struct DispatchSpmv +struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv { //--------------------------------------------------------------------- // Constants and Types @@ -625,12 +636,12 @@ struct DispatchSpmv constexpr int threads_in_block = EMPTY_MATRIX_KERNEL_THREADS; const int blocks_in_grid = ::cuda::ceil_div(spmv_params.num_rows, threads_in_block); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_empty_matrix_kernel<<<%d, %d, 0, %lld>>>()\n", blocks_in_grid, threads_in_block, 
(long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) .doit(spmv_empty_matrix_kernel, spmv_params); @@ -662,12 +673,12 @@ struct DispatchSpmv int degen_col_kernel_block_size = INIT_KERNEL_THREADS; int degen_col_kernel_grid_size = ::cuda::ceil_div(spmv_params.num_rows, degen_col_kernel_block_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -789,12 +800,12 @@ struct DispatchSpmv // Use separate search kernel if we have enough spmv tiles to saturate the device // Log spmv_search_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", search_grid_size, search_block_size, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(search_grid_size, search_block_size, 0, stream) @@ -815,7 +826,7 @@ struct DispatchSpmv } // Log spmv_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", spmv_grid_size.x, spmv_grid_size.y, @@ -824,7 +835,7 @@ struct DispatchSpmv (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke spmv_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(spmv_grid_size, spmv_config.block_threads, 0, stream) @@ -853,7 +864,7 @@ struct DispatchSpmv if (num_merge_tiles > 1) { // Log segment_fixup_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", segment_fixup_grid_size.x, segment_fixup_grid_size.y, @@ -862,7 +873,7 @@ struct DispatchSpmv (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke segment_fixup_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index d4af506a6d9..4114e583d01 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -30,9 +30,7 @@ _CCCL_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_BEGIN -namespace detail -{ -namespace reduce +namespace detail::reduce { template @@ -374,8 +372,7 @@ struct dispatch_streaming_arg_reduce_t } }; -} // namespace reduce -} // namespace detail +} // namespace detail::reduce CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 2d5566d76a3..c1320d59219 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -52,10 +52,7 @@ CUB_NAMESPACE_BEGIN -namespace detail -{ - -namespace three_way_partition +namespace detail::three_way_partition { // Offset type used to 
instantiate the stream three-way-partition-kernel and agent to index the items within one // partition @@ -131,8 +128,6 @@ public: } } }; -} // namespace three_way_partition -} // namespace detail /****************************************************************************** * Kernel entry points @@ -231,6 +226,7 @@ DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, int num_tiles, NumS } } } +} // namespace detail::three_way_partition /****************************************************************************** * Dispatch @@ -319,7 +315,7 @@ struct DispatchThreeWayPartitionIf // The maximum number of items for which we will ever invoke the kernel (i.e. largest partition size) auto const max_partition_size = - static_cast(::cuda::std::min(static_cast(num_items), static_cast(partition_size))); + static_cast((::cuda::std::min)(static_cast(num_items), static_cast(partition_size))); // The number of partitions required to "iterate" over the total input auto const num_partitions = @@ -387,12 +383,12 @@ struct DispatchThreeWayPartitionIf // Log three_way_partition_init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(current_num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking three_way_partition_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, reinterpret_cast(stream)); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke three_way_partition_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -420,7 +416,7 @@ struct DispatchThreeWayPartitionIf } // Log select_if_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; @@ -440,7 +436,7 @@ struct DispatchThreeWayPartitionIf items_per_thread, range_select_sm_occupancy); } -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(current_num_tiles, block_threads, 0, stream) @@ -483,8 +479,8 @@ struct DispatchThreeWayPartitionIf { using MaxPolicyT = typename PolicyHub::MaxPolicy; return Invoke( - DeviceThreeWayPartitionInitKernel, - DeviceThreeWayPartitionKernel< + detail::three_way_partition::DeviceThreeWayPartitionInitKernel, + detail::three_way_partition::DeviceThreeWayPartitionKernel< MaxPolicyT, InputIteratorT, FirstOutputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index fa4fa80d0ef..f35e89a133f 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -53,9 +53,7 @@ _CCCL_NV_DIAG_SUPPRESS(186) CUB_NAMESPACE_BEGIN -namespace detail -{ -namespace transform +namespace detail::transform { template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment) @@ -118,7 +116,7 @@ _CCCL_DEVICE void transform_kernel_impl( constexpr int block_dim = PrefetchPolicy::block_threads; const int tile_stride = block_dim * num_elem_per_thread; const Offset offset = static_cast(blockIdx.x) * tile_stride; - const int tile_size = static_cast(::cuda::std::min(num_items - offset, Offset{tile_stride})); + const int tile_size = static_cast((::cuda::std::min)(num_items - offset, Offset{tile_stride})); // move index and iterator domain to the block/thread index, to reduce arithmetic in the 
loops below { @@ -329,7 +327,7 @@ _CCCL_DEVICE void transform_kernel_ublkcp( constexpr int block_dim = BulkCopyPolicy::block_threads; const int tile_stride = block_dim * num_elem_per_thread; const Offset offset = static_cast(blockIdx.x) * tile_stride; - const int tile_size = ::cuda::std::min(num_items - offset, Offset{tile_stride}); + const int tile_size = (::cuda::std::min)(num_items - offset, Offset{tile_stride}); const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x; if (inner_blocks) @@ -688,10 +686,8 @@ struct dispatch_t(tile_size); if (smem_size > *max_smem) { -# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS // assert should be prevented by smem check in policy - assert(last_counts.elem_per_thread > 0 && "min_items_per_thread exceeds available shared memory"); -# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + _CCCL_ASSERT_HOST(last_counts.elem_per_thread > 0, "min_items_per_thread exceeds available shared memory"); return last_counts; } @@ -729,12 +725,10 @@ struct dispatch_telem_per_thread > 0); - assert(config->tile_size > 0); - assert(config->tile_size % bulk_copy_alignment == 0); - assert((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0)); // logical xor -# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + _CCCL_ASSERT_HOST(config->elem_per_thread > 0, ""); + _CCCL_ASSERT_HOST(config->tile_size > 0, ""); + _CCCL_ASSERT_HOST(config->tile_size % bulk_copy_alignment == 0, ""); + _CCCL_ASSERT_HOST((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0), ""); // logical xor const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{config->tile_size})); return ::cuda::std::make_tuple( @@ -812,7 +806,7 @@ struct dispatch_t( - ::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy))); + (::cuda::std::min)(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy))); const int items_per_thread_clamped = ::cuda::std::clamp( items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread); @@ -862,6 +856,5 @@ struct dispatch_t __launch_bounds__(int( - cub::detail::vsmem_helper_default_fallback_policy_t< + vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT, AgentUniqueByKey, KeyInputIteratorT, @@ -145,9 +147,9 @@ __launch_bounds__(int( EqualityOpT equality_op, OffsetT num_items, int num_tiles, - cub::detail::vsmem_t vsmem) + vsmem_t vsmem) { - using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< + using VsmemHelperT = vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT, AgentUniqueByKey, KeyInputIteratorT, @@ -176,7 +178,7 @@ __launch_bounds__(int( // If applicable, hints to discard modified cache lines for vsmem VsmemHelperT::discard_temp_storage(temp_storage); } - +} // namespace detail::unique_by_key /****************************************************************************** * Dispatch ******************************************************************************/ @@ -333,7 +335,7 @@ struct DispatchUniqueByKey using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< Policy, - AgentUniqueByKey, + detail::unique_by_key::AgentUniqueByKey, KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, @@ -396,9 +398,9 @@ struct DispatchUniqueByKey num_tiles = CUB_MAX(1, num_tiles); int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS); -#ifdef 
CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -439,7 +441,7 @@ struct DispatchUniqueByKey scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log select_if_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG { // Get SM occupancy for unique_by_key_kernel int scan_sm_occupancy; @@ -461,7 +463,7 @@ struct DispatchUniqueByKey items_per_thread, scan_sm_occupancy); } -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke select_if_kernel error = @@ -501,8 +503,8 @@ struct DispatchUniqueByKey { // Ensure kernels are instantiated. return Invoke( - DeviceCompactInitKernel, - DeviceUniqueByKeySweepKernel< + detail::scan::DeviceCompactInitKernel, + detail::unique_by_key::DeviceUniqueByKeySweepKernel< typename PolicyHub::MaxPolicy, KeyInputIteratorT, ValueInputIteratorT, diff --git a/cub/cub/device/dispatch/kernels/reduce.cuh b/cub/cub/device/dispatch/kernels/reduce.cuh index 2064d6f2a09..ca1ed19b529 100644 --- a/cub/cub/device/dispatch/kernels/reduce.cuh +++ b/cub/cub/device/dispatch/kernels/reduce.cuh @@ -103,8 +103,6 @@ finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_ { *d_out = block_aggregate; } -} // namespace reduce -} // namespace detail /** * @brief Reduce region kernel entry point (multi-block). Computes privatized @@ -161,14 +159,14 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) TransformOpT transform_op) { // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; + using AgentReduceT = detail::reduce::AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + AccumT*, + OffsetT, + ReductionOpT, + AccumT, + TransformOpT>; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; @@ -243,14 +241,14 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( TransformOpT transform_op) { // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; + using AgentReduceT = detail::reduce::AgentReduce< + typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT, + AccumT, + TransformOpT>; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; @@ -276,5 +274,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( detail::reduce::finalize_and_store_aggregate(d_out, reduction_op, init, block_aggregate); } } +} // namespace reduce +} // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/scan.cuh b/cub/cub/device/dispatch/kernels/scan.cuh index d38676a84b5..cc3034638bc 100644 --- a/cub/cub/device/dispatch/kernels/scan.cuh +++ b/cub/cub/device/dispatch/kernels/scan.cuh @@ -42,6 +42,11 @@ CUB_NAMESPACE_BEGIN +namespace detail +{ +namespace scan +{ + /****************************************************************************** * Kernel entry points *****************************************************************************/ @@ -169,7 +174,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT; // Thread block type for scanning input tiles - 
using AgentScanT = + using AgentScanT = detail::scan:: AgentScan; // Shared memory for AgentScan @@ -181,4 +186,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile); } +} // namespace scan +} // namespace detail + CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 3932ac74c68..1a06c25cb92 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -133,7 +133,7 @@ struct policy_hub static constexpr int t_scale(int nominalItemsPerThread) { - return ::cuda::std::max(nominalItemsPerThread / NumActiveChannels / v_scale, 1); + return (::cuda::std::max)(nominalItemsPerThread / NumActiveChannels / v_scale, 1); } // SM35 diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 02bfb443fc1..41fbb2c49a4 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -610,7 +610,7 @@ struct sm90_tuning struct policy_hub { - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(KeyT), sizeof(AccumT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(KeyT), sizeof(AccumT))); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(AccumT); template diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index 33771f6882f..87631d1199e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -236,7 +236,7 @@ struct sm90_tuning struct policy_hub { - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(KeyT), sizeof(LengthT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(KeyT), sizeof(LengthT))); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(LengthT); template diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index fc8add23a22..b3eaa4e513c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -714,7 +714,7 @@ template ; - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(key_t), sizeof(AccumT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(key_t), sizeof(AccumT))); static constexpr int combined_input_bytes = static_cast(sizeof(key_t) + sizeof(AccumT)); struct Policy350 : ChainedPolicy<350, Policy350, Policy350> diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh index f2ae69fc091..7e134f7a63f 100644 --- a/cub/cub/grid/grid_barrier.cuh +++ b/cub/cub/grid/grid_barrier.cuh @@ -79,7 +79,7 @@ public: // Threadfence and syncthreads to make sure global writes are visible before // thread-0 reports in with its sync counter __threadfence(); - CTA_SYNC(); + __syncthreads(); if (blockIdx.x == 0) { @@ -89,7 +89,7 @@ public: d_vol_sync[blockIdx.x] = 1; } - CTA_SYNC(); + __syncthreads(); // Wait for everyone else to report in for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) @@ -100,7 +100,7 @@ public: } } - CTA_SYNC(); + 
__syncthreads(); // Let everyone know it's safe to proceed for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) @@ -122,7 +122,7 @@ public: } } - CTA_SYNC(); + __syncthreads(); } } }; diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index a5ce583a1cc..524217c70ea 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -416,7 +416,7 @@ struct CachingDeviceAllocator // Lock mutex.lock(); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog( "Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes_); #endif @@ -527,7 +527,7 @@ struct CachingDeviceAllocator cached_bytes[device].free -= search_key.bytes; cached_bytes[device].live += search_key.bytes; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with " "stream %lld).\n", device, @@ -572,7 +572,7 @@ struct CachingDeviceAllocator if (error == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", device, (long long) search_key.bytes, @@ -611,7 +611,7 @@ struct CachingDeviceAllocator // Reduce balance and erase entry cached_bytes[device].free -= block_itr->bytes; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks " "(%lld bytes) outstanding.\n", device, @@ -656,7 +656,7 @@ struct CachingDeviceAllocator cached_bytes[device].live += search_key.bytes; mutex.unlock(); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", device, search_key.d_ptr, @@ -678,7 +678,7 @@ struct CachingDeviceAllocator // Copy device pointer to output parameter *d_ptr = search_key.d_ptr; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG if (debug) { _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", @@ -761,7 +761,7 @@ struct CachingDeviceAllocator cached_blocks.insert(search_key); cached_bytes[device].free += search_key.bytes; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld " "bytes), %lld live blocks outstanding. 
(%lld bytes)\n", device, @@ -819,7 +819,7 @@ struct CachingDeviceAllocator return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld " "bytes), %lld live blocks (%lld bytes) outstanding.\n", device, @@ -914,7 +914,7 @@ struct CachingDeviceAllocator cached_bytes[current_device].free -= block_bytes; cached_blocks.erase(begin); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", current_device, diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index b1da6a03b5d..a2093ae288b 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -47,6 +47,10 @@ #include #include +#include +#include +#include + // Legacy include; this functionality used to be defined in here. #include @@ -113,27 +117,24 @@ namespace detail static constexpr ::cuda::std::size_t max_smem_per_block = 48 * 1024; } // namespace detail -template +template struct RegBoundScaling { - enum - { - ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, - ((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), - }; + static constexpr int ITEMS_PER_THREAD = + (::cuda::std::max)(1, Nominal4ByteItemsPerThread * 4 / (::cuda::std::max)(4, int{sizeof(T)})); + static constexpr int BLOCK_THREADS = (::cuda::std::min)( + Nominal4ByteBlockThreads, + ::cuda::ceil_div(int{detail::max_smem_per_block} / (int{sizeof(T)} * ITEMS_PER_THREAD), 32) * 32); }; -template +template struct MemBoundScaling { - enum - { - ITEMS_PER_THREAD = - CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, - ((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), - }; + static constexpr int ITEMS_PER_THREAD = (::cuda::std::max)( + 1, (::cuda::std::min)(Nominal4ByteItemsPerThread * 4 / int{sizeof(T)}, Nominal4ByteItemsPerThread * 2)); + static constexpr int BLOCK_THREADS = (::cuda::std::min)( + Nominal4ByteBlockThreads, + ::cuda::ceil_div(int{detail::max_smem_per_block} / (int{sizeof(T)} * ITEMS_PER_THREAD), 32) * 32); }; #endif // Do not document diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index a6eee36539c..d765b6374aa 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -88,7 +88,7 @@ CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); // C++17 dialect check: # ifndef CCCL_IGNORE_DEPRECATED_CPP_DIALECT # if _CCCL_STD_VER < 2017 -CUB_COMP_DEPR_IMPL(CUB requires at least C++ 17. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) +# error CUB requires at least C++17. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message. # endif // _CCCL_STD_VER >= 2017 # endif diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index 275c915e8f2..099408897ad 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -66,22 +66,6 @@ */ # define CUB_DEBUG_SYNC -/** - * @def CUB_DEBUG_HOST_ASSERTIONS - * - * Extends `CUB_DEBUG_SYNC` effects by checking host-side precondition - * assertions. 
- */ -# define CUB_DEBUG_HOST_ASSERTIONS - -/** - * @def CUB_DEBUG_DEVICE_ASSERTIONS - * - * Extends `CUB_DEBUG_HOST_ASSERTIONS` effects by checking device-side - * precondition assertions. - */ -# define CUB_DEBUG_DEVICE_ASSERTIONS - /** * @def CUB_DEBUG_ALL * @@ -94,80 +78,29 @@ #endif // _CCCL_DOXYGEN_INVOKED -// `CUB_DETAIL_DEBUG_LEVEL_*`: Implementation details, internal use only: - -#define CUB_DETAIL_DEBUG_LEVEL_NONE 0 -#define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY 1 -#define CUB_DETAIL_DEBUG_LEVEL_LOG 2 -#define CUB_DETAIL_DEBUG_LEVEL_SYNC 3 -#define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS 4 -#define CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS 5 -#define CUB_DETAIL_DEBUG_LEVEL_ALL 1000 - -// `CUB_DEBUG_*`: User interfaces: - -// Extra logging, no syncs -#ifdef CUB_DEBUG_LOG -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_LOG -#endif - -// Logging + syncs +// CUB_DEBUG_SYNC also enables CUB_DEBUG_LOG #ifdef CUB_DEBUG_SYNC -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_SYNC -#endif - -// Logging + syncs + host assertions -#ifdef CUB_DEBUG_HOST_ASSERTIONS -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS -#endif - -// Logging + syncs + host assertions + device assertions -#ifdef CUB_DEBUG_DEVICE_ASSERTIONS -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS -#endif - -// All -#ifdef CUB_DEBUG_ALL -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_ALL -#endif - -// Default case, no extra debugging: -#ifndef CUB_DETAIL_DEBUG_LEVEL -# ifdef NDEBUG -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_NONE -# else -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY +# ifndef CUB_DEBUG_LOG +# define CUB_DEBUG_LOG # endif #endif -/* - * `CUB_DETAIL_DEBUG_ENABLE_*`: - * Internal implementation details, used for testing enabled debug features: - */ - -#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_LOG -# define CUB_DETAIL_DEBUG_ENABLE_LOG -#endif - -#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_SYNC -# define CUB_DETAIL_DEBUG_ENABLE_SYNC -#endif - -#if (CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS) \ - || (CUB_DETAIL_DEBUG_LEVEL == CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY) -# define CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS -#endif - -#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS -# define CUB_DETAIL_DEBUG_ENABLE_DEVICE_ASSERTIONS -#endif +// CUB_DEBUG_ALL = CUB_DEBUG_LOG + CUB_DEBUG_SYNC +#ifdef CUB_DEBUG_ALL +# ifndef CUB_DEBUG_LOG +# define CUB_DEBUG_LOG +# endif // CUB_DEBUG_LOG +# ifndef CUB_DEBUG_SYNC +# define CUB_DEBUG_SYNC +# endif // CUB_DEBUG_SYNC +#endif // CUB_DEBUG_ALL /// CUB error reporting macro (prints error messages to stderr) #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) # define CUB_STDERR #endif -#if defined(CUB_STDERR) || defined(CUB_DETAIL_DEBUG_ENABLE_LOG) +#if defined(CUB_STDERR) || defined(CUB_DEBUG_LOG) # include #endif diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index b9e4f5c25e6..add033fd030 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -47,7 +47,6 @@ # pragma system_header #endif // no system header -#include // IWYU pragma: export #include #include // for backward compatibility @@ -81,7 +80,6 @@ struct policy_wrapper_t : PolicyT static constexpr int BLOCK_THREADS = BLOCK_THREADS_; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; }; -} // namespace detail /** * \brief Empty kernel for querying PTX manifest metadata (e.g., 
version) for the current device @@ -90,6 +88,8 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void EmptyKernel() {} +} // namespace detail + #endif // _CCCL_DOXYGEN_INVOKED /** @@ -278,7 +278,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) // Instantiate `EmptyKernel` in both host and device code to ensure // it can be called. using EmptyKernelPtr = void (*)(); - EmptyKernelPtr empty_kernel = EmptyKernel; + EmptyKernelPtr empty_kernel = detail::EmptyKernel; // This is necessary for unused variable warnings in host compilers. The // usual syntax of (void)empty_kernel; was not sufficient on MSVC2015. @@ -437,62 +437,28 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = return result; } -/** - * Synchronize the specified \p stream. - */ +//! Synchronize the specified \p stream when called in host code. Otherwise, does nothing. CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) { - cudaError_t result = cudaErrorNotSupported; - - NV_IF_TARGET(NV_IS_HOST, - (result = CubDebug(cudaStreamSynchronize(stream));), - ((void) stream; result = CubDebug(cub::detail::device_synchronize());)); - - return result; + NV_IF_TARGET( + NV_IS_HOST, (return CubDebug(cudaStreamSynchronize(stream));), ((void) stream; return cudaErrorNotSupported;)); } namespace detail { - -/** - * Same as SyncStream, but intended for use with the debug_synchronous flags - * in device algorithms. This should not be used if synchronization is required - * for correctness. - * - * If `debug_synchronous` is false, this function will immediately return - * cudaSuccess. If true, one of the following will occur: - * - * If synchronization is supported by the current compilation target and - * settings, the sync is performed and the sync result is returned. - * - * If syncs are not supported then no sync is performed, but a message is logged - * via _CubLog and cudaSuccess is returned. - */ +//! If CUB_DEBUG_SYNC is defined and this function is called from host code, a sync is performed and the +//! sync result is returned. Otherwise, does nothing. CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream) { -#ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC - +#ifndef CUB_DEBUG_SYNC (void) stream; return cudaSuccess; - -#else // CUB_DETAIL_DEBUG_ENABLE_SYNC: - -# define CUB_TMP_SYNC_AVAILABLE \ - _CubLog("%s\n", "Synchronizing..."); \ - return SyncStream(stream) - -# define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \ - (void) stream; \ - _CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \ - "device-side sync requires -constexpr _CCCL_HOST_DEVICE auto min CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) - -> decltype(t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u)) -{ - return t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u); -} - -template -constexpr _CCCL_HOST_DEVICE auto max CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) - -> decltype(t < u ? ::cuda::std::forward(u) : ::cuda::std::forward(t)) -{ - return t < u ? ::cuda::std::forward(u) : ::cuda::std::forward(t); -} -# undef CUB_PREVENT_MACRO_SUBSTITUTION -#endif - #ifndef CUB_MAX /// Select maximum(a, b) +/// Deprecated since [2.8] # define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) #endif #ifndef CUB_MIN /// Select minimum(a, b) +/// Deprecated since [2.8] # define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) #endif #ifndef CUB_QUOTIENT_FLOOR /// Quotient of x/y rounded down to nearest integer +/// Deprecated since [2.8] # define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) #endif #ifndef CUB_QUOTIENT_CEILING /// Quotient of x/y rounded up to nearest integer +/// Deprecated since [2.8] // FIXME(bgruber): the following computation can overflow, use cuda::ceil_div instead # define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) #endif #ifndef CUB_ROUND_UP_NEAREST /// x rounded up to the nearest multiple of y +/// Deprecated since [2.8] # define CUB_ROUND_UP_NEAREST(x, y) (CUB_QUOTIENT_CEILING(x, y) * y) #endif #ifndef CUB_ROUND_DOWN_NEAREST /// x rounded down to the nearest multiple of y +/// Deprecated since [2.8] # define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) #endif diff --git a/cub/cub/util_math.cuh b/cub/cub/util_math.cuh index 9578c84319b..1a3940f6146 100644 --- a/cub/cub/util_math.cuh +++ b/cub/cub/util_math.cuh @@ -43,6 +43,8 @@ #endif // no system header #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -66,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT safe_add_bound_to_max(OffsetT lhs, O { static_assert(::cuda::std::is_integral::value, "OffsetT must be an integral type"); static_assert(sizeof(OffsetT) >= 4, "OffsetT must be at least 32 bits in size"); - auto const capped_operand_rhs = (cub::min)(rhs, ::cuda::std::numeric_limits::max() - lhs); + auto const capped_operand_rhs = (::cuda::std::min)(rhs, ::cuda::std::numeric_limits::max() - lhs); return lhs + capped_operand_rhs; } @@ -74,14 +76,15 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT safe_add_bound_to_max(OffsetT lhs, O constexpr _CCCL_HOST_DEVICE int Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes) { - return (cub::min)(nominal_4b_items_per_thread, (cub::max)(1, nominal_4b_items_per_thread * 8 / combined_bytes)); + return (::cuda::std::min)(nominal_4b_items_per_thread, + (::cuda::std::max)(1, nominal_4b_items_per_thread * 8 / combined_bytes)); } template constexpr _CCCL_HOST_DEVICE int Nominal4BItemsToItems(int nominal_4b_items_per_thread) { - return (cub::min)(nominal_4b_items_per_thread, - (cub::max)(1, nominal_4b_items_per_thread * 4 / static_cast(sizeof(T)))); + return (::cuda::std::min)(nominal_4b_items_per_thread, + (::cuda::std::max)(1, nominal_4b_items_per_thread * 4 / static_cast(sizeof(T)))); } template @@ -89,10 +92,11 @@ constexpr _CCCL_HOST_DEVICE int Nominal8BItemsToItems(int nominal_8b_items_per_t { return sizeof(ItemT) <= 8u ? nominal_8b_items_per_thread - : (cub::min)(nominal_8b_items_per_thread, - (cub::max)(1, - ((nominal_8b_items_per_thread * 8) + static_cast(sizeof(ItemT)) - 1) - / static_cast(sizeof(ItemT)))); + : (::cuda::std::min)( + nominal_8b_items_per_thread, + (::cuda::std::max)(1, + ((nominal_8b_items_per_thread * 8) + static_cast(sizeof(ItemT)) - 1) + / static_cast(sizeof(ItemT)))); } /** diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index aa522d9576e..99beeed313e 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -52,34 +52,10 @@ CUB_NAMESPACE_BEGIN * Inlined PTX intrinsics ******************************************************************************/ -namespace detail -{ -/** - * @brief Shifts @p val left by the amount specified by unsigned 32-bit value in @p num_bits. If @p - * num_bits is larger than 32 bits, @p num_bits is clamped to 32. 
- */ -_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftLeft(uint32_t val, uint32_t num_bits) -{ - uint32_t ret{}; - asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); - return ret; -} - -/** - * @brief Shifts @p val right by the amount specified by unsigned 32-bit value in @p num_bits. If @p - * num_bits is larger than 32 bits, @p num_bits is clamped to 32. - */ -_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftRight(uint32_t val, uint32_t num_bits) -{ - uint32_t ret{}; - asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); - return ret; -} -} // namespace detail - /** * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; @@ -90,6 +66,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int /** * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; @@ -150,6 +127,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned in /** * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE void BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits) { @@ -159,6 +137,7 @@ BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, u /** * \brief Three-operand add. Returns \p x + \p y + \p z. */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) { asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); @@ -192,6 +171,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y * \endcode * */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned int index) { int ret; @@ -204,6 +184,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned /** * Sync-threads barrier. 
*/ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count) { asm volatile("bar.sync 1, %0;" : : "r"(count)); @@ -212,6 +193,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count) /** * CTA barrier */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC() { __syncthreads(); @@ -220,6 +202,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC() /** * CTA barrier with predicate */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads_and() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p) { return __syncthreads_and(p); @@ -228,6 +211,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p) /** * CTA barrier with predicate */ +CCCL_DEPRECATED_BECAUSE("use __syncthreads_or() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p) { return __syncthreads_or(p); @@ -236,6 +220,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p) /** * Warp barrier */ +CCCL_DEPRECATED_BECAUSE("use __syncwarp() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask) { __syncwarp(member_mask); @@ -244,6 +229,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask) /** * Warp any */ +CCCL_DEPRECATED_BECAUSE("use __any_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_mask) { return __any_sync(member_mask, predicate); @@ -252,6 +238,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_m /** * Warp any */ +CCCL_DEPRECATED_BECAUSE("use __all_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_mask) { return __all_sync(member_mask, predicate); @@ -260,6 +247,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_m /** * Warp ballot */ +CCCL_DEPRECATED_BECAUSE("use __ballot_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_BALLOT(int predicate, unsigned int member_mask) { return __ballot_sync(member_mask, predicate); @@ -292,6 +280,7 @@ SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member /** * Warp synchronous shfl_idx */ +CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) { @@ -304,6 +293,7 @@ SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_ma /** * Warp synchronous shfl_idx */ +CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) { return __shfl_sync(member_mask, word, src_lane); @@ -312,6 +302,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int /** * Floating point multiply. (Mantissa LSB rounds towards zero.) */ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b) { float d; @@ -322,6 +313,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b) /** * Floating point multiply-add. (Mantissa LSB rounds towards zero.) 
*/ +CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") _CCCL_DEVICE _CCCL_FORCEINLINE float FFMA_RZ(float a, float b, float c) { float d; @@ -342,6 +334,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadExit() /** * \brief Abort execution and generate an interrupt to the host CPU */ +CCCL_DEPRECATED_BECAUSE("use cuda::std::terminate() instead") _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadTrap() { asm volatile("trap;"); @@ -359,6 +352,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int RowMajorTid(int block_dim_x, int block_dim_y, /** * \brief Returns the warp lane ID of the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_laneid() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId() { unsigned int ret; @@ -370,6 +364,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId() * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not * correspond to a zero-based ranking within the thread block. */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_warpid() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int WarpId() { unsigned int ret; @@ -409,6 +404,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned int WarpMask(unsigned int warp_id) /** * \brief Returns the warp lane mask of all lanes less than the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_lt() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt() { unsigned int ret; @@ -419,6 +415,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt() /** * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_le() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe() { unsigned int ret; @@ -429,6 +426,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe() /** * \brief Returns the warp lane mask of all lanes greater than the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_gt() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt() { unsigned int ret; @@ -439,6 +437,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt() /** * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread */ +CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_ge() instead") _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGe() { unsigned int ret; @@ -659,12 +658,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleDown(T input, int src_offset, int last_t template _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned int member_mask) { - /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - enum - { - SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) - }; - using ShuffleWord = typename UnitWord::ShuffleWord; constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); @@ -674,18 +667,14 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned in ShuffleWord* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; - shuffle_word = SHFL_IDX_SYNC((unsigned int) input_alias[0], src_lane, SHFL_C, member_mask); - + shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[0], src_lane, LOGICAL_WARP_THREADS); output_alias[0] = shuffle_word; - #pragma unroll for (int WORD = 1; WORD < WORDS; ++WORD) { - shuffle_word = SHFL_IDX_SYNC((unsigned int) input_alias[WORD], src_lane, SHFL_C, member_mask); - + 
shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[WORD], src_lane, LOGICAL_WARP_THREADS); output_alias[WORD] = shuffle_word; } - return output; } @@ -750,6 +739,28 @@ struct warp_matcher_t } }; +/** + * @brief Shifts @p val left by the amount specified by unsigned 32-bit value in @p num_bits. If @p + * num_bits is larger than 32 bits, @p num_bits is clamped to 32. + */ +_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftLeft(uint32_t val, uint32_t num_bits) +{ + uint32_t ret{}; + asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); + return ret; +} + +/** + * @brief Shifts @p val right by the amount specified by unsigned 32-bit value in @p num_bits. If @p + * num_bits is larger than 32 bits, @p num_bits is clamped to 32. + */ +_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftRight(uint32_t val, uint32_t num_bits) +{ + uint32_t ret{}; + asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); + return ret; +} + } // namespace detail #endif // _CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh index f5926ce11e5..baba489c0ae 100644 --- a/cub/cub/util_vsmem.cuh +++ b/cub/cub/util_vsmem.cuh @@ -168,7 +168,7 @@ public: static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) { // Ensure all threads finished using temporary storage - CTA_SYNC(); + __syncthreads(); const std::size_t linear_tid = threadIdx.x; const std::size_t block_stride = line_size * blockDim.x; diff --git a/cub/cub/warp/specializations/warp_exchange_shfl.cuh b/cub/cub/warp/specializations/warp_exchange_shfl.cuh index 5abfa7cdd2f..f874f961caa 100644 --- a/cub/cub/warp/specializations/warp_exchange_shfl.cuh +++ b/cub/cub/warp/specializations/warp_exchange_shfl.cuh @@ -40,6 +40,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -273,8 +275,8 @@ public: WarpExchangeShfl() = delete; explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeShfl(TempStorage&) - : lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + : lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} diff --git a/cub/cub/warp/specializations/warp_exchange_smem.cuh b/cub/cub/warp/specializations/warp_exchange_smem.cuh index aabb9e291e9..35b688f813c 100644 --- a/cub/cub/warp/specializations/warp_exchange_smem.cuh +++ b/cub/cub/warp/specializations/warp_exchange_smem.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -88,8 +90,8 @@ public: explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 
0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} @@ -102,7 +104,7 @@ public: const int idx = ITEMS_PER_THREAD * lane_id + item; temp_storage.items_shared[idx] = input_items[item]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { @@ -120,7 +122,7 @@ public: const int idx = LOGICAL_WARP_THREADS * item + lane_id; temp_storage.items_shared[idx] = input_items[item]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { @@ -147,13 +149,13 @@ public: { if (INSERT_PADDING) { - ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); + ranks[ITEM] = (ranks[ITEM] >> LOG_SMEM_BANKS) + ranks[ITEM]; } temp_storage.items_shared[ranks[ITEM]] = input_items[ITEM]; } - WARP_SYNC(member_mask); + __syncwarp(member_mask); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -162,7 +164,7 @@ public: if (INSERT_PADDING) { - item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset; } output_items[ITEM] = temp_storage.items_shared[item_offset]; diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index 3e0db152123..8c4ad78d1ad 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -48,6 +48,7 @@ #include #include +#include #include #include @@ -82,8 +83,6 @@ template struct reduce_max_exists : ::cuda::std::true_type {}; -} // namespace detail - /** * @brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned * across a CUDA thread warp. @@ -155,7 +154,7 @@ struct WarpReduceShfl /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceShfl(TempStorage& /*temp_storage*/) - : lane_id(static_cast(LaneId())) + : lane_id(static_cast(::cuda::ptx::get_sreg_laneid())) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { @@ -699,7 +698,7 @@ struct WarpReduceShfl _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { // Get the start flags for each thread in the warp. - int warp_flags = WARP_BALLOT(flag, member_mask); + int warp_flags = __ballot_sync(member_mask, flag); // Convert to tail-segmented if (HEAD_SEGMENTED) @@ -708,7 +707,7 @@ struct WarpReduceShfl } // Mask out the bits below the current thread - warp_flags &= LaneMaskGe(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_ge(); // Mask of physical lanes outside the logical warp and convert to logical lanemask if (!IS_ARCH_WARP) @@ -738,5 +737,11 @@ struct WarpReduceShfl return output; } }; +} // namespace detail + +template +using WarpReduceShfl CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::WarpReduceShfl; CUB_NAMESPACE_END diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 87b38db2aa3..ade195ee6cb 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -49,8 +49,11 @@ #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN +namespace detail +{ /** * @brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned * across a CUDA thread warp. 
@@ -123,8 +126,8 @@ struct WarpReduceSmem /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) - , member_mask(WarpMask(LaneId() / LOGICAL_WARP_THREADS)) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) + , member_mask(WarpMask(::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** @@ -159,7 +162,7 @@ struct WarpReduceSmem // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update input if peer_addend is in range if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) @@ -168,7 +171,7 @@ struct WarpReduceSmem input = reduction_op(input, peer_addend); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); return ReduceStep(input, valid_items, reduction_op, Int2Type()); } @@ -222,7 +225,7 @@ struct WarpReduceSmem SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type /*has_ballot*/) { // Get the start flags for each thread in the warp. - int warp_flags = WARP_BALLOT(flag, member_mask); + int warp_flags = __ballot_sync(member_mask, flag); if (!HEAD_SEGMENTED) { @@ -230,12 +233,12 @@ struct WarpReduceSmem } // Keep bits above the current thread. - warp_flags &= LaneMaskGt(); + warp_flags &= ::cuda::ptx::get_sreg_lanemask_gt(); // Accommodate packing of multiple logical warps in a single physical warp if (!IS_ARCH_WARP) { - warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + warp_flags >>= (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; } // Find next flag @@ -255,7 +258,7 @@ struct WarpReduceSmem // Share input into buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update input if peer_addend is in range if (OFFSET + lane_id < next_flag) @@ -264,7 +267,7 @@ struct WarpReduceSmem input = reduction_op(input, peer_addend); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); } return input; @@ -311,12 +314,12 @@ struct WarpReduceSmem // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Get peer from buffer T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Share flag through buffer flag_storage[lane_id] = flag_status; @@ -409,5 +412,10 @@ struct WarpReduceSmem return SegmentedReduce(input, flag, reduction_op, Int2Type()); } }; +} // namespace detail +template +using WarpReduceSmem CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::WarpReduceSmem; CUB_NAMESPACE_END diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index c3952b96b4f..402b476c4e4 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -48,8 +48,11 @@ #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN +namespace detail +{ /** * @brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned * across a CUDA thread warp. 
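Editor's note on the substitutions above: the mechanical replacements applied throughout these warp specializations follow a single mapping from the now-deprecated cub helpers to CUDA built-ins and cuda::ptx special-register accessors. Below is a small device-code sketch of that mapping; the helper function and variable names are hypothetical stand-ins for the surrounding agent code, not part of this diff.

// Deprecated helper            ->  replacement used in this diff
// CTA_SYNC()                   ->  __syncthreads()
// WARP_SYNC(member_mask)       ->  __syncwarp(member_mask)
// WARP_BALLOT(pred, mask)      ->  __ballot_sync(mask, pred)   (note: argument order is swapped)
// LaneId()                     ->  ::cuda::ptx::get_sreg_laneid()
// LaneMaskGt() / LaneMaskGe()  ->  ::cuda::ptx::get_sreg_lanemask_gt() / _ge()
#include <cuda/ptx>

// Hypothetical helper mirroring the rewritten SegmentedReduce flag handling above.
__device__ unsigned int flags_above_current_lane(int flag, unsigned int member_mask)
{
  unsigned int warp_flags = __ballot_sync(member_mask, flag); // was WARP_BALLOT(flag, member_mask)
  return warp_flags & ::cuda::ptx::get_sreg_lanemask_gt();    // was LaneMaskGt()
}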
@@ -116,7 +119,7 @@ struct WarpScanShfl /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpScanShfl(TempStorage& /*temp_storage*/) - : lane_id(LaneId()) + : lane_id(::cuda::ptx::get_sreg_laneid()) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { @@ -511,7 +514,7 @@ struct WarpScanShfl // Iterate scan steps int segment_first_lane = 0; -// Iterate scan steps + // Iterate scan steps #pragma unroll for (int STEP = 0; STEP < STEPS; STEP++) { @@ -540,15 +543,15 @@ struct WarpScanShfl KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); - unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); + unsigned int ballot = __ballot_sync(member_mask, (pred_key != inclusive_output.key)); // Mask away all lanes greater than ours - ballot = ballot & LaneMaskLe(); + ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le(); // Find index of first set bit int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); -// Iterate scan steps + // Iterate scan steps #pragma unroll for (int STEP = 0; STEP < STEPS; STEP++) { @@ -672,5 +675,11 @@ struct WarpScanShfl Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); } }; +} // namespace detail + +template +using WarpScanShfl CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::WarpScanShfl; CUB_NAMESPACE_END diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh index 90bdfbf361a..090f0f96cb5 100644 --- a/cub/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/cub/warp/specializations/warp_scan_smem.cuh @@ -49,8 +49,11 @@ #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN +namespace detail +{ /** * @brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned * across a CUDA thread warp. @@ -113,10 +116,10 @@ struct WarpScanSmem : temp_storage(temp_storage.Alias()) , - lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) + lane_id(IS_ARCH_WARP ? 
::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) , - member_mask(WarpMask(LaneId() / LOGICAL_WARP_THREADS)) + member_mask(WarpMask(::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** @@ -132,7 +135,7 @@ struct WarpScanSmem // Share partial into buffer ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update partial if addend is in range if (HAS_IDENTITY || (lane_id >= OFFSET)) @@ -140,7 +143,7 @@ struct WarpScanSmem T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); partial = scan_op(addend, partial); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); ScanStep(partial, scan_op, Int2Type()); } @@ -171,7 +174,7 @@ struct WarpScanSmem T identity = 0; ThreadStore(&temp_storage[lane_id], (CellT) identity); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Iterate scan steps output = input; @@ -226,7 +229,7 @@ struct WarpScanSmem ThreadStore(temp_storage, (CellT) input); } - WARP_SYNC(member_mask); + __syncwarp(member_mask); return (T) ThreadLoad(temp_storage); } @@ -276,11 +279,11 @@ struct WarpScanSmem // Retrieve aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); } //--------------------------------------------------------------------- @@ -307,7 +310,7 @@ struct WarpScanSmem // initial value unknown ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); } @@ -334,7 +337,7 @@ struct WarpScanSmem inclusive = scan_op(initial_value, inclusive); ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); if (lane_id == 0) @@ -364,7 +367,7 @@ struct WarpScanSmem // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); @@ -385,7 +388,7 @@ struct WarpScanSmem // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); exclusive = inclusive - input; @@ -408,11 +411,11 @@ struct WarpScanSmem // Broadcast warp aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - WARP_SYNC(member_mask); + __syncwarp(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - WARP_SYNC(member_mask); + __syncwarp(member_mask); // Update inclusive with initial value inclusive = scan_op(initial_value, inclusive); @@ -420,7 +423,7 @@ struct WarpScanSmem // Get exclusive from exclusive ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); - WARP_SYNC(member_mask); + 
__syncwarp(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); @@ -430,5 +433,11 @@ struct WarpScanSmem } } }; +} // namespace detail + +template +using WarpScanSmem CCCL_DEPRECATED_BECAUSE( + "This class is considered an implementation detail and the public interface will be " + "removed.") = detail::WarpScanSmem; CUB_NAMESPACE_END diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh index ac5c700b958..3f11129c35a 100644 --- a/cub/cub/warp/warp_load.cuh +++ b/cub/cub/warp/warp_load.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -438,14 +440,16 @@ public: //! shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad() : temp_storage(PrivateStorage()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @} end member group diff --git a/cub/cub/warp/warp_merge_sort.cuh b/cub/cub/warp/warp_merge_sort.cuh index 40e29322c1f..de3d311ae59 100644 --- a/cub/cub/warp/warp_merge_sort.cuh +++ b/cub/cub/warp/warp_merge_sort.cuh @@ -41,6 +41,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -151,8 +152,10 @@ public: WarpMergeSort() = delete; _CCCL_DEVICE _CCCL_FORCEINLINE WarpMergeSort(typename BlockMergeSortStrategyT::TempStorage& temp_storage) - : BlockMergeSortStrategyT(temp_storage, IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) - , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) + : BlockMergeSortStrategyT( + temp_storage, + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) + , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} @@ -164,7 +167,7 @@ public: private: _CCCL_DEVICE _CCCL_FORCEINLINE void SyncImplementation() const { - WARP_SYNC(member_mask); + __syncwarp(member_mask); } friend BlockMergeSortStrategyT; diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 00440c18bdf..4b2c61e343a 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -174,8 +174,8 @@ public: /// Internal specialization. /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two - using InternalWarpReduce = - ::cuda::std::_If, WarpReduceSmem>; + using InternalWarpReduce = ::cuda::std:: + _If, detail::WarpReduceSmem>; #endif // _CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh index 0e0668709b0..6eb6a35562b 100644 --- a/cub/cub/warp/warp_scan.cuh +++ b/cub/cub/warp/warp_scan.cuh @@ -49,6 +49,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -179,8 +180,8 @@ private: /// Internal specialization. 
/// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two - using InternalWarpScan = - ::cuda::std::_If, WarpScanSmem>; + using InternalWarpScan = ::cuda::std:: + _If, detail::WarpScanSmem>; /// Shared memory storage layout type for WarpScan using _TempStorage = typename InternalWarpScan::TempStorage; @@ -212,7 +213,7 @@ public: //! Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE WarpScan(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) + , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS) {} //! @} end member group diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh index bb99bc5965e..f0a9929e24f 100644 --- a/cub/cub/warp/warp_store.cuh +++ b/cub/cub/warp/warp_store.cuh @@ -45,6 +45,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN //! @rst @@ -378,14 +380,16 @@ public: //! memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore() : temp_storage(PrivateStorage()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) + , linear_tid( + IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)) {} //! @} end member group diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index c86d24754de..5a093526edd 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -260,7 +260,7 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) ) cub_clone_target_properties(${test_target} ${cub_target}) target_include_directories(${test_target} PRIVATE "${CUB_SOURCE_DIR}/test") - target_compile_definitions(${test_target} PRIVATE CUB_DETAIL_DEBUG_ENABLE_SYNC) + target_compile_definitions(${test_target} PRIVATE CUB_DEBUG_SYNC) if ("${test_target}" MATCHES "nvtx_in_usercode") target_link_libraries(${test_target} PRIVATE nvtx3-cpp) diff --git a/cub/test/catch2_large_array_sort_helper.cuh b/cub/test/catch2_large_array_sort_helper.cuh index 6c0ed2a48be..4f80a2cd595 100644 --- a/cub/test/catch2_large_array_sort_helper.cuh +++ b/cub/test/catch2_large_array_sort_helper.cuh @@ -67,7 +67,7 @@ template class key_sort_ref_key_transform { static constexpr double max_key = static_cast(::cuda::std::numeric_limits::max()); - const double m_conversion; + double m_conversion; std::size_t m_num_items; bool m_is_descending; @@ -140,7 +140,7 @@ public: _CCCL_HOST_DEVICE KeyType operator()(std::size_t idx) const { // The final summary may be padded, so truncate the summary_idx at the last valid idx: - const std::size_t summary_idx = thrust::min(m_num_summaries - 1, idx / m_unpadded_run_size); + const std::size_t summary_idx = cuda::std::min(m_num_summaries - 1, idx / m_unpadded_run_size); const KeyType key = m_is_descending ? 
static_cast((m_num_summaries - 1 - summary_idx) * m_key_conversion) : static_cast(summary_idx * m_key_conversion); diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh index fe188a17bf1..642b2aed4f1 100644 --- a/cub/test/catch2_radix_sort_helper.cuh +++ b/cub/test/catch2_radix_sort_helper.cuh @@ -41,6 +41,7 @@ #include #include +#include #include #include @@ -54,7 +55,7 @@ // Index types used for OffsetsT testing using offset_types = c2h::type_list; using all_offset_types = - c2h::type_list; + c2h::type_list; // Create a segment iterator that returns the next multiple of Step except for a few cases. This allows to save memory template @@ -62,35 +63,13 @@ struct segment_iterator { OffsetT last = 0; - segment_iterator(OffsetT last1) + segment_iterator(std::int64_t last1) : last{last1} {} - __host__ __device__ OffsetT operator()(OffsetT x) const + __host__ __device__ OffsetT operator()(std::int64_t x) const { - switch (x) - { - case Step * 100: - return Step * 100 + Step / 2; - case Step * 200: - return Step * 200 + Step / 2; - case Step * 300: - return Step * 300 + Step / 2; - case Step * 400: - return Step * 400 + Step / 2; - case Step * 500: - return Step * 500 + Step / 2; - case Step * 600: - return Step * 600 + Step / 2; - case Step * 700: - return Step * 700 + Step / 2; - case Step * 800: - return Step * 800 + Step / 2; - case Step * 900: - return Step * 900 + Step / 2; - default: - return (x >= last) ? last : x * Step; - } + return (::cuda::std::min)(last, x * Step); } }; diff --git a/cub/test/catch2_segmented_sort_helper.cuh b/cub/test/catch2_segmented_sort_helper.cuh index 0852921bebf..f8a081a125a 100644 --- a/cub/test/catch2_segmented_sort_helper.cuh +++ b/cub/test/catch2_segmented_sort_helper.cuh @@ -26,7 +26,6 @@ ******************************************************************************/ #pragma once -// #define CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT #include #include @@ -38,6 +37,7 @@ #include #include #include +#include #include #include @@ -46,11 +46,11 @@ #include +#include "catch2_test_launch_helper.h" #include #include #include #include -#include #include #define MAKE_SEED_MOD_FUNCTION(name, xor_mask) \ @@ -71,6 +71,194 @@ MAKE_SEED_MOD_FUNCTION(offset_eraser, 0x3333333333333333) #undef MAKE_SEED_MOD_FUNCTION +// Helper to generate a certain number of empty segments followed by equi-sized segments. 
+template +struct segment_index_to_offset_op +{ + SegmentIndexT num_empty_segments; + SegmentIndexT num_segments; + OffsetT segment_size; + OffsetT num_items; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT operator()(SegmentIndexT i) + { + if (i < num_empty_segments) + { + return 0; + } + else if (i < num_segments) + { + return segment_size * static_cast(i - num_empty_segments); + } + else + { + return num_items; + } + } +}; + +template +struct mod_n +{ + std::size_t mod; + + template + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T operator()(IndexT x) + { + return static_cast(x % mod); + } +}; + +template +struct short_key_verification_helper +{ + using key_t = KeyT; + // The histogram size of the keys being sorted for later verification + const std::int64_t max_histo_size = std::int64_t{1} << ::cuda::std::numeric_limits::digits; + + // Holding the histogram of the keys being sorted for verification + c2h::host_vector keys_histogram{}; + +public: + void prepare_verification_data(const c2h::device_vector& in_keys) + { + c2h::host_vector h_in{in_keys}; + keys_histogram = c2h::host_vector(max_histo_size, 0); + for (const auto& key : h_in) + { + keys_histogram[key]++; + } + } + + void verify_sorted(const c2h::device_vector& out_keys) const + { + // Verify keys are sorted next to each other + auto count = thrust::unique_count(c2h::device_policy, out_keys.cbegin(), out_keys.cend(), thrust::equal_to()); + REQUIRE(count <= max_histo_size); + + // Verify keys are sorted using prior histogram computation + auto index_it = thrust::make_counting_iterator(std::size_t{0}); + c2h::device_vector unique_keys_out(count); + c2h::device_vector unique_indexes_out(count); + thrust::unique_by_key_copy( + c2h::device_policy, + out_keys.cbegin(), + out_keys.cend(), + index_it, + unique_keys_out.begin(), + unique_indexes_out.begin()); + + for (int i = 0; i < count; i++) + { + auto const next_end = (i == count - 1) ? out_keys.size() : unique_indexes_out[i + 1]; + REQUIRE(keys_histogram[unique_keys_out[i]] == next_end - unique_indexes_out[i]); + } + } +}; + +template +class segmented_verification_helper +{ +private: + using key_t = KeyT; + const std::size_t sequence_length{}; + + // Analytically computes the histogram for a segment of a series of keys: [0, 1, 2, ..., mod_n - 1, 0, 1, 2, ...]. + // `segment_end` is one-past-the-end of the segment to compute the histogram for. + c2h::host_vector compute_histogram_of_series(std::size_t segment_offset, std::size_t segment_end) const + { + // The i-th full cycle begins after segment_offset + const auto start_cycle = cuda::ceil_div(segment_offset, sequence_length); + + // The last full cycle ending before segment_end + const auto end_cycle = segment_end / sequence_length; + + // Number of full cycles repeating the sequence + const int full_cycles = (end_cycle > start_cycle) ? 
static_cast(end_cycle - start_cycle) : 0; + + // Add contributions from full cycles + c2h::host_vector histogram(sequence_length, full_cycles); + + // Partial cycles preceding the first full cycle + for (std::size_t j = segment_offset; j < start_cycle * sequence_length; ++j) + { + const auto value = j % sequence_length; + histogram[value]++; + } + + // Partial cycles following the last full cycle + for (std::size_t j = end_cycle * sequence_length; j < segment_end; ++j) + { + const auto value = j % sequence_length; + histogram[value]++; + } + return histogram; + } + +public: + segmented_verification_helper(int sequence_length) + : sequence_length(sequence_length) + {} + + void prepare_input_data(c2h::device_vector& in_keys) const + { + auto data_gen_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), mod_n{sequence_length}); + thrust::copy_n(data_gen_it, in_keys.size(), in_keys.begin()); + } + + template + void verify_sorted(c2h::device_vector& out_keys, SegmentOffsetItT offsets, std::size_t num_segments) const + { + // The segments' end-offsets are provided by the segments' begin-offset iterator + auto offsets_plus_1 = offsets + 1; + + // Verify keys are sorted next to each other + const auto count = static_cast( + thrust::unique_count(c2h::device_policy, out_keys.cbegin(), out_keys.cend(), thrust::equal_to())); + REQUIRE(count <= sequence_length * num_segments); + + // // Verify keys are sorted using prior histogram computation + auto index_it = thrust::make_counting_iterator(std::size_t{0}); + c2h::device_vector unique_keys_out(count); + c2h::device_vector unique_indexes_out(count); + thrust::unique_by_key_copy( + c2h::device_policy, + out_keys.cbegin(), + out_keys.cend(), + index_it, + unique_keys_out.begin(), + unique_indexes_out.begin()); + + // Copy the unique keys and indexes to host memory + c2h::host_vector h_unique_keys_out{unique_keys_out}; + c2h::host_vector h_unique_indexes_out{unique_indexes_out}; + + // Verify keys are sorted using prior histogram computation + std::size_t uniques_index = 0; + std::size_t current_offset = 0; + for (std::size_t seg_index = 0; seg_index < num_segments; ++seg_index) + { + const auto segment_offset = offsets[seg_index]; + const auto segment_end = offsets_plus_1[seg_index]; + const auto segment_histogram = compute_histogram_of_series(segment_offset, segment_end); + for (std::size_t i = 0; i < sequence_length; i++) + { + if (segment_histogram[i] != 0) + { + CAPTURE(seg_index, i, uniques_index, current_offset, count); + auto const next_end = + (uniques_index == count - 1) ? 
out_keys.size() : h_unique_indexes_out[uniques_index + 1]; + REQUIRE(h_unique_keys_out[uniques_index] == i); + REQUIRE(next_end - h_unique_indexes_out[uniques_index] == static_cast(segment_histogram[i])); + current_offset += segment_histogram[i]; + uniques_index++; + } + } + } + } +}; + template struct unwrap_value_t_impl { diff --git a/cub/test/catch2_test_block_run_length_decode.cu b/cub/test/catch2_test_block_run_length_decode.cu index cf080e173d7..dc322e49f8a 100644 --- a/cub/test/catch2_test_block_run_length_decode.cu +++ b/cub/test/catch2_test_block_run_length_decode.cu @@ -104,7 +104,7 @@ private: BlockRunOffsetScanT(temp_storage.run_offsets_scan_storage).ExclusiveSum(run_lengths, run_offsets, decoded_size); // Ensure temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); // Construct BlockRunLengthDecode and initialize with the run offsets return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage, unique_items, run_offsets); @@ -137,7 +137,7 @@ private: } // Ensure BlockLoad's temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); // Load this block's tile of run lengths if (num_valid_items < RUNS_PER_BLOCK) @@ -151,7 +151,7 @@ private: } // Ensure temporary shared memory can be repurposed - cub::CTA_SYNC(); + __syncthreads(); } public: diff --git a/cub/test/catch2_test_debug.cu b/cub/test/catch2_test_debug.cu index 3293ca6b7d7..a158ff9afd1 100644 --- a/cub/test/catch2_test_debug.cu +++ b/cub/test/catch2_test_debug.cu @@ -11,7 +11,7 @@ TEST_CASE("CubDebug returns input error", "[debug][utils]") TEST_CASE("CubDebug returns new errors", "[debug][utils]") { - cub::EmptyKernel<<<0, 0>>>(); + cub::detail::EmptyKernel<<<0, 0>>>(); cudaError error = cudaPeekAtLastError(); REQUIRE(error != cudaSuccess); @@ -20,7 +20,7 @@ TEST_CASE("CubDebug returns new errors", "[debug][utils]") TEST_CASE("CubDebug prefers input errors", "[debug][utils]") { - cub::EmptyKernel<<<0, 0>>>(); + cub::detail::EmptyKernel<<<0, 0>>>(); cudaError error = cudaPeekAtLastError(); REQUIRE(error != cudaSuccess); @@ -29,7 +29,7 @@ TEST_CASE("CubDebug prefers input errors", "[debug][utils]") TEST_CASE("CubDebug resets last error", "[debug][utils]") { - cub::EmptyKernel<<<0, 0>>>(); + cub::detail::EmptyKernel<<<0, 0>>>(); cudaError error = cudaPeekAtLastError(); REQUIRE(error != cudaSuccess); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu b/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu index 561290ce075..b58109cb657 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu @@ -34,7 +34,6 @@ #include -#include "catch2/catch.hpp" #include "catch2_test_launch_helper.h" #include diff --git a/cub/test/catch2_test_device_segmented_sort_keys.cu b/cub/test/catch2_test_device_segmented_sort_keys.cu index 823665ee0ef..3d392e8e8f6 100644 --- a/cub/test/catch2_test_device_segmented_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_sort_keys.cu @@ -24,62 +24,20 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ + #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first +#include + #include "catch2_radix_sort_helper.cuh" +#include "catch2_segmented_sort_helper.cuh" #include -#include // FIXME: Graph launch disabled, algorithm syncs internally. 
WAR exists for device-launch, figure out how to enable for // graph launch. - -// TODO replace with DeviceSegmentedSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceSegmentedSort algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_sort_wrapper( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - NumItemsT num_items, - NumItemsT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - bool* selector, - bool is_overwrite = false, - cudaStream_t stream = 0) -{ - cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - cub::DoubleBuffer d_values; - auto status = - cub::DispatchSegmentedSort:: - Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - is_overwrite, - stream); - if (status != cudaSuccess) - { - return status; - } - if (is_overwrite) - { - // Only write to selector in the DoubleBuffer invocation - *selector = d_keys.Current() != d_keys_out; - } - return cudaSuccess; -} - // %PARAM% TEST_LAUNCH lid 0:1 -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_wrapper, dispatch_segmented_sort_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_wrapper, dispatch_segmented_sort); +DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedSort::StableSortKeys, stable_sort_keys); using key_types = c2h::type_list(C2H_SEED(4)); } -#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) - -// we can reuse the same structure of DeviceSegmentedRadixSortKeys for simplicity -C2H_TEST("DeviceSegmentedSortKeys: very large num. items and num. segments", - "[keys][segmented][sort][device]", - all_offset_types) +C2H_TEST("DeviceSegmentedSortKeys: very large number of segments", "[keys][segmented][sort][device]", all_offset_types) try { - using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs - using offset_t = c2h::get<0, TestType>; - constexpr std::size_t Step = 500; - using segment_iterator_t = segment_iterator; - constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); - constexpr int num_key_seeds = 1; - const bool is_descending = GENERATE(false, true); - const bool is_overwrite = GENERATE(false, true); + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using segment_offset_t = std::int64_t; + using offset_t = c2h::get<0, TestType>; + using segment_iterator_t = segment_index_to_offset_op; + constexpr std::size_t segment_size = 1000000; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); - const std::size_t num_segments = ::cuda::ceil_div(num_items, Step); - CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + constexpr segment_offset_t num_empty_segments = uint32_max; + const segment_offset_t num_segments = num_empty_segments + ::cuda::ceil_div(num_items, segment_size); + CAPTURE(c2h::type_name(), num_items, num_segments); c2h::device_vector in_keys(num_items); c2h::device_vector out_keys(num_items); - c2h::gen(C2H_SEED(num_key_seeds), in_keys); - auto offsets = - thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); - auto offsets_plus_1 = offsets + 1; - // Allocate host/device-accessible memory to communicate the selected output buffer - bool* selector_ptr = nullptr; - if (is_overwrite) - { - REQUIRE(cudaMallocHost(&selector_ptr, sizeof(*selector_ptr)) == cudaSuccess); - } - - auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, num_segments, offsets, offsets_plus_1); - auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); - if (is_descending) - { - dispatch_segmented_sort_descending( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - static_cast(num_items), - static_cast(num_segments), - offsets, - offsets_plus_1, - selector_ptr, - is_overwrite); - } - else - { - dispatch_segmented_sort( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - static_cast(num_items), - static_cast(num_segments), - offsets, - offsets_plus_1, - selector_ptr, - is_overwrite); - } - if (is_overwrite) - { - if (*selector_ptr) - { - std::swap(out_keys, in_keys); - } - REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); - } - REQUIRE((ref_keys == out_keys) == true); + + // Generate input keys + constexpr auto max_histo_size = 250; + segmented_verification_helper verification_helper{max_histo_size}; + verification_helper.prepare_input_data(in_keys); + + auto offsets = thrust::make_transform_iterator( + thrust::make_counting_iterator(std::size_t{0}), + segment_iterator_t{num_empty_segments, num_segments, segment_size, num_items}); + + stable_sort_keys( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets + 1); + + // Verify the keys are sorted correctly + verification_helper.verify_sorted(out_keys, offsets + num_empty_segments, num_segments - num_empty_segments); } catch (std::bad_alloc& e) { @@ -299,15 +225,14 @@ C2H_TEST("DeviceSegmentedSort::SortKeys: very large segments", "[keys][segmented try { using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using segment_offset_t = std::int32_t; using offset_t = c2h::get<0, TestType>; constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); constexpr int num_key_seeds = 1; - const bool is_descending = GENERATE(false, true); - const bool is_overwrite = GENERATE(false, true); constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); - const std::size_t num_segments = 2; - CAPTURE(c2h::type_name(), num_items, is_descending, is_overwrite); + const segment_offset_t num_segments = 2; + CAPTURE(c2h::type_name(), num_items, num_segments); c2h::device_vector in_keys(num_items); c2h::device_vector out_keys(num_items); @@ -317,51 +242,22 @@ try offsets[1] = static_cast(num_items); offsets[2] = static_cast(num_items); - // Allocate host/device-accessible memory to communicate the selected output buffer - bool* selector_ptr = nullptr; - if (is_overwrite) - { - REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); - } - auto ref_keys = segmented_radix_sort_reference(in_keys, is_descending, offsets); - auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); - if (is_descending) - { - dispatch_segmented_sort_descending( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - static_cast(num_items), - static_cast(num_segments), - thrust::raw_pointer_cast(offsets.data()), - offsets.cbegin() + 1, - selector_ptr, - is_overwrite); - } - else - { - dispatch_segmented_sort( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - static_cast(num_items), - static_cast(num_segments), - thrust::raw_pointer_cast(offsets.data()), - offsets.cbegin() + 1, - selector_ptr, - is_overwrite); - } - if (is_overwrite) - { - if (*selector_ptr) - { - std::swap(out_keys, in_keys); - } - REQUIRE(cudaSuccess == cudaFreeHost(selector_ptr)); - } - REQUIRE((ref_keys == out_keys) == true); + // Prepare information for later verification + short_key_verification_helper verification_helper{}; + verification_helper.prepare_verification_data(in_keys); + + stable_sort_keys( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1); + + // Verify the keys are sorted correctly + verification_helper.verify_sorted(out_keys); } catch (std::bad_alloc& e) { std::cerr << "Skipping segmented sort test, insufficient GPU memory. " << e.what() << "\n"; } - -#endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) diff --git a/cub/test/catch2_test_device_segmented_sort_pairs.cu b/cub/test/catch2_test_device_segmented_sort_pairs.cu index a3034608076..f24d30dbed1 100644 --- a/cub/test/catch2_test_device_segmented_sort_pairs.cu +++ b/cub/test/catch2_test_device_segmented_sort_pairs.cu @@ -27,66 +27,14 @@ #include "catch2_radix_sort_helper.cuh" // above header needs to be included first +#include "catch2_segmented_sort_helper.cuh" #include -#include // FIXME: Graph launch disabled, algorithm syncs internally. WAR exists for device-launch, figure out how to enable for // graph launch. 
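The rewritten very-large-problem tests (keys above, pairs below) no longer materialize a full reference sort. They generate keys with a known periodic pattern, describe segments through the segment_index_to_offset_op helper, and check the output against analytically computed histograms. A small host-side sketch of that offset mapping, using tiny illustrative parameters rather than the test's actual sizes:

#include <cstdint>
#include <cstdio>

// Mirrors the semantics of the segment_index_to_offset_op helper added above:
// a run of empty segments, then equi-sized segments, clamped to num_items.
std::int64_t segment_offset(std::int64_t i,
                            std::int64_t num_empty_segments,
                            std::int64_t num_segments,
                            std::int64_t segment_size,
                            std::int64_t num_items)
{
  if (i < num_empty_segments)
  {
    return 0;
  }
  if (i < num_segments)
  {
    return segment_size * (i - num_empty_segments);
  }
  return num_items;
}

int main()
{
  const std::int64_t num_empty = 3, segment_size = 4, num_items = 10;
  const std::int64_t num_segments = num_empty + (num_items + segment_size - 1) / segment_size;
  for (std::int64_t i = 0; i <= num_segments; ++i)
  {
    std::printf("offset[%lld] = %lld\n",
                static_cast<long long>(i),
                static_cast<long long>(segment_offset(i, num_empty, num_segments, segment_size, num_items)));
  }
  return 0;
}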
- -// TODO replace with DeviceSegmentedSort::SortPairs interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceSegmentedSort algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_sort_pairs_wrapper( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - NumItemsT num_items, - NumItemsT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - bool* selector, - bool is_overwrite = false, - cudaStream_t stream = 0) -{ - cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - cub::DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - - auto status = cub:: - DispatchSegmentedSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - is_overwrite, - stream); - if (status != cudaSuccess) - { - return status; - } - if (is_overwrite) - { - // Only write to selector in the DoubleBuffer invocation - *selector = d_keys.Current() != d_keys_out; - } - return cudaSuccess; -} - // %PARAM% TEST_LAUNCH lid 0:1 -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_pairs_wrapper, dispatch_segmented_sort_pairs_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_sort_pairs_wrapper, dispatch_segmented_sort_pairs); +DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedSort::StableSortPairs, stable_sort_pairs); using pair_types = c2h::type_list, @@ -251,90 +199,56 @@ C2H_TEST("DeviceSegmentedSortPairs: Unspecified segments, random key/values", test_unspecified_segments_random(C2H_SEED(4)); } -#if defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) - -// we can reuse the same structure of DeviceSegmentedRadixSortPairs for simplicity C2H_TEST("DeviceSegmentedSortPairs: very large num. items and num. segments", "[pairs][segmented][sort][device]", all_offset_types) try { - using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs - using value_t = cuda::std::uint8_t; - using offset_t = c2h::get<0, TestType>; - constexpr std::size_t Step = 500; - using segment_iterator_t = segment_iterator; - constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); - constexpr int num_key_seeds = 1; - constexpr int num_value_seeds = 1; - const bool is_descending = GENERATE(false, true); - const bool is_overwrite = GENERATE(false, true); + using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs + using value_t = cuda::std::uint8_t; + using segment_offset_t = std::int64_t; + using offset_t = c2h::get<0, TestType>; + using segment_iterator_t = segment_index_to_offset_op; + constexpr std::size_t segment_size = 1000000; + constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? 
uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); - const std::size_t num_segments = ::cuda::ceil_div(num_items, Step); - CAPTURE(c2h::type_name(), num_items, num_segments, is_descending, is_overwrite); + constexpr segment_offset_t num_empty_segments = uint32_max; + const segment_offset_t num_segments = num_empty_segments + ::cuda::ceil_div(num_items, segment_size); + CAPTURE(c2h::type_name(), num_items, num_segments); + // Generate input c2h::device_vector in_keys(num_items); c2h::device_vector in_values(num_items); - c2h::gen(C2H_SEED(num_key_seeds), in_keys); - c2h::gen(C2H_SEED(num_value_seeds), in_values); + constexpr auto max_histo_size = 250; + segmented_verification_helper verification_helper{max_histo_size}; + verification_helper.prepare_input_data(in_keys); + thrust::copy(in_keys.cbegin(), in_keys.cend(), in_values.begin()); // Initialize the output vectors by copying the inputs since not all items may belong to a segment. c2h::device_vector out_keys(num_items); c2h::device_vector out_values(num_items); - auto offsets = - thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), segment_iterator_t{num_items}); + + auto offsets = thrust::make_transform_iterator( + thrust::make_counting_iterator(std::size_t{0}), + segment_iterator_t{num_empty_segments, num_segments, segment_size, num_items}); auto offsets_plus_1 = offsets + 1; - bool* selector_ptr = nullptr; - if (is_overwrite) - { - REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); - } - - auto refs = segmented_radix_sort_reference(in_keys, in_values, is_descending, num_segments, offsets, offsets_plus_1); - auto& ref_keys = refs.first; - auto& ref_values = refs.second; - auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); - auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); - if (is_descending) - { - dispatch_segmented_sort_pairs_descending( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - offsets, - offsets_plus_1, - selector_ptr, - is_overwrite); - } - else - { - dispatch_segmented_sort_pairs( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - offsets, - offsets_plus_1, - selector_ptr, - is_overwrite); - } - if (is_overwrite) - { - if (*selector_ptr) - { - std::swap(out_keys, in_keys); - std::swap(out_values, in_values); - } - REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); - } - REQUIRE(ref_keys == out_keys); - REQUIRE(ref_values == out_values); + + stable_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + thrust::raw_pointer_cast(in_values.data()), + thrust::raw_pointer_cast(out_values.data()), + static_cast(num_items), + static_cast(num_segments), + offsets, + offsets_plus_1); + + // Verify the keys are sorted correctly + verification_helper.verify_sorted(out_keys, offsets + num_empty_segments, num_segments - num_empty_segments); + + // Verify values were sorted along with the keys + REQUIRE(thrust::equal(out_keys.cbegin(), out_keys.cend(), out_values.cbegin())); } catch (std::bad_alloc& e) { @@ -346,82 +260,47 @@ try { using key_t = cuda::std::uint8_t; // minimize memory footprint to support a wider range of GPUs using value_t = cuda::std::uint8_t; + using segment_offset_t = std::int32_t; using offset_t = c2h::get<0, 
TestType>; constexpr std::size_t uint32_max = ::cuda::std::numeric_limits::max(); constexpr int num_key_seeds = 1; - constexpr int num_value_seeds = 1; - const bool is_descending = GENERATE(false, true); - const bool is_overwrite = GENERATE(false, true); constexpr std::size_t num_items = (sizeof(offset_t) == 8) ? uint32_max + (1 << 20) : ::cuda::std::numeric_limits::max(); - constexpr std::size_t num_segments = 2; - CAPTURE(c2h::type_name(), num_items, is_descending, is_overwrite); + constexpr segment_offset_t num_segments = 2; + CAPTURE(c2h::type_name(), num_items, num_segments); c2h::device_vector in_keys(num_items); c2h::device_vector in_values(num_items); c2h::device_vector out_keys(num_items); c2h::gen(C2H_SEED(num_key_seeds), in_keys); - c2h::gen(C2H_SEED(num_value_seeds), in_values); + thrust::copy(in_keys.cbegin(), in_keys.cend(), in_values.begin()); c2h::device_vector out_values(num_items); c2h::device_vector offsets(num_segments + 1); - offsets[0] = 0; - offsets[1] = static_cast(num_items); - offsets[2] = static_cast(num_items); - bool* selector_ptr = nullptr; - if (is_overwrite) - { - REQUIRE(cudaSuccess == cudaMallocHost(&selector_ptr, sizeof(*selector_ptr))); - } - - auto refs = segmented_radix_sort_reference( - in_keys, in_values, is_descending, num_segments, offsets.cbegin(), offsets.cbegin() + 1); - auto& ref_keys = refs.first; - auto& ref_values = refs.second; - auto out_keys_ptr = thrust::raw_pointer_cast(out_keys.data()); - auto out_values_ptr = thrust::raw_pointer_cast(out_values.data()); - if (is_descending) - { - dispatch_segmented_sort_pairs_descending( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - thrust::raw_pointer_cast(offsets.data()), - offsets.cbegin() + 1, - selector_ptr, - is_overwrite); - } - else - { - dispatch_segmented_sort_pairs( - thrust::raw_pointer_cast(in_keys.data()), - out_keys_ptr, - thrust::raw_pointer_cast(in_values.data()), - out_values_ptr, - static_cast(num_items), - static_cast(num_segments), - thrust::raw_pointer_cast(offsets.data()), - offsets.cbegin() + 1, - selector_ptr, - is_overwrite); - } - if (is_overwrite) - { - if (*selector_ptr) - { - std::swap(out_keys, in_keys); - std::swap(out_values, in_values); - } - REQUIRE(cudaFreeHost(selector_ptr) == cudaSuccess); - } - REQUIRE(ref_keys == out_keys); - REQUIRE(ref_values == out_values); + offsets[0] = 0; + offsets[1] = static_cast(num_items); + offsets[2] = static_cast(num_items); + + // Prepare information for later verification + short_key_verification_helper verification_helper{}; + verification_helper.prepare_verification_data(in_keys); + + stable_sort_pairs( + thrust::raw_pointer_cast(in_keys.data()), + thrust::raw_pointer_cast(out_keys.data()), + thrust::raw_pointer_cast(in_values.data()), + thrust::raw_pointer_cast(out_values.data()), + static_cast(num_items), + static_cast(num_segments), + thrust::raw_pointer_cast(offsets.data()), + offsets.cbegin() + 1); + + // Verify the keys are sorted correctly + verification_helper.verify_sorted(out_keys); + + // Verify values were sorted along with the keys + REQUIRE(thrust::equal(out_keys.cbegin(), out_keys.cend(), out_values.cbegin())); } catch (std::bad_alloc& e) { std::cerr << "Skipping segmented sort test, insufficient GPU memory. 
" << e.what() << "\n"; } - -#endif // defined(CCCL_TEST_ENABLE_LARGE_SEGMENTED_SORT) diff --git a/cub/test/catch2_test_vsmem.cu b/cub/test/catch2_test_vsmem.cu index cf86389d68c..6b16bde7fa9 100644 --- a/cub/test/catch2_test_vsmem.cu +++ b/cub/test/catch2_test_vsmem.cu @@ -33,7 +33,6 @@ #include #include -#include "catch2/catch.hpp" #include "catch2_test_launch_helper.h" #include diff --git a/cub/test/catch2_test_warp_merge_sort.cu b/cub/test/catch2_test_warp_merge_sort.cu index 7b245ebba33..fa4f986ad64 100644 --- a/cub/test/catch2_test_warp_merge_sort.cu +++ b/cub/test/catch2_test_warp_merge_sort.cu @@ -88,7 +88,7 @@ __global__ void warp_merge_sort_kernel(T* in, T* out, SegmentSizeItT segment_siz const int idx = thread_offset + item; thread_data[item] = in[idx]; } - cub::WARP_SYNC(warp_sort.get_member_mask()); + __syncwarp(warp_sort.get_member_mask()); // Run merge sort test action(warp_sort, thread_data, valid_items, oob_default); @@ -153,7 +153,7 @@ __global__ void warp_merge_sort_kernel( keys[item] = keys_in[idx]; values[item] = values_in[idx]; } - cub::WARP_SYNC(warp_sort.get_member_mask()); + __syncwarp(warp_sort.get_member_mask()); // Run merge sort test action(warp_sort, keys, values, valid_items, oob_default); diff --git a/cub/test/insert_nested_NVTX_range_guard.h b/cub/test/insert_nested_NVTX_range_guard.h index 56d7aad6bc1..9da6cf042b1 100644 --- a/cub/test/insert_nested_NVTX_range_guard.h +++ b/cub/test/insert_nested_NVTX_range_guard.h @@ -5,7 +5,7 @@ #include #include -#include +#include #if defined(__cpp_inline_variables) inline thread_local bool entered = false; diff --git a/cub/test/test_allocator.cu b/cub/test/test_allocator.cu index 4b4723fe997..9628e936a13 100644 --- a/cub/test/test_allocator.cu +++ b/cub/test/test_allocator.cu @@ -105,7 +105,7 @@ int main(int argc, char** argv) CubDebugExit(allocator.DeviceAllocate((void**) &d_999B_stream0_a, 999, 0)); // Run some big kernel in stream 0 - EmptyKernel<<<32000, 512, 1024 * 8, 0>>>(); + detail::EmptyKernel<<<32000, 512, 1024 * 8, 0>>>(); // Free d_999B_stream0_a CubDebugExit(allocator.DeviceFree(d_999B_stream0_a)); @@ -120,7 +120,7 @@ int main(int argc, char** argv) AssertEquals(allocator.cached_blocks.size(), 0); // Run some big kernel in stream 0 - EmptyKernel<<<32000, 512, 1024 * 8, 0>>>(); + detail::EmptyKernel<<<32000, 512, 1024 * 8, 0>>>(); // Free d_999B_stream0_b CubDebugExit(allocator.DeviceFree(d_999B_stream0_b)); @@ -138,7 +138,7 @@ int main(int argc, char** argv) AssertEquals(allocator.cached_blocks.size(), 1); // Run some big kernel in other_stream - EmptyKernel<<<32000, 512, 1024 * 8, other_stream>>>(); + detail::EmptyKernel<<<32000, 512, 1024 * 8, other_stream>>>(); // Free d_999B_stream_other CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a)); @@ -170,7 +170,7 @@ int main(int argc, char** argv) AssertEquals(allocator.cached_blocks.size(), 0); // Run some big kernel in other_stream - EmptyKernel<<<32000, 512, 1024 * 8, other_stream>>>(); + detail::EmptyKernel<<<32000, 512, 1024 * 8, other_stream>>>(); // Free d_999B_stream_other_a and d_999B_stream_other_b CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a)); @@ -388,7 +388,7 @@ int main(int argc, char** argv) // Prime the caching allocator and the kernel CubDebugExit(allocator.DeviceAllocate((void**) &d_1024MB, timing_bytes)); CubDebugExit(allocator.DeviceFree(d_1024MB)); - cub::EmptyKernel<<<1, 32>>>(); + detail::EmptyKernel<<<1, 32>>>(); // CUDA cpu_timer.Start(); @@ -427,7 +427,7 @@ int main(int argc, char** argv) gpu_timer.Start(); for (int 
i = 0; i < timing_iterations; ++i) { - cub::EmptyKernel<<<1, 32>>>(); + detail::EmptyKernel<<<1, 32>>>(); } gpu_timer.Stop(); float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis(); @@ -437,7 +437,7 @@ int main(int argc, char** argv) for (int i = 0; i < timing_iterations; ++i) { CubDebugExit(cudaMalloc((void**) &d_1024MB, timing_bytes)); - cub::EmptyKernel<<<1, 32>>>(); + detail::EmptyKernel<<<1, 32>>>(); CubDebugExit(cudaFree(d_1024MB)); } gpu_timer.Stop(); @@ -448,7 +448,7 @@ int main(int argc, char** argv) for (int i = 0; i < timing_iterations; ++i) { CubDebugExit(allocator.DeviceAllocate((void**) &d_1024MB, timing_bytes)); - cub::EmptyKernel<<<1, 32>>>(); + detail::EmptyKernel<<<1, 32>>>(); CubDebugExit(allocator.DeviceFree(d_1024MB)); } gpu_timer.Stop(); diff --git a/cub/test/test_device_batch_memcpy.cu b/cub/test/test_device_batch_memcpy.cu index 2d550e32fa0..7ddb22cffc0 100644 --- a/cub/test/test_device_batch_memcpy.cu +++ b/cub/test/test_device_batch_memcpy.cu @@ -348,7 +348,7 @@ void RunTest(BufferOffsetT num_buffers, template __global__ void TestVectorizedCopyKernel(const void* d_in, void* d_out, ByteOffsetT copy_size) { - cub::detail::VectorizedCopy(threadIdx.x, d_out, copy_size, d_in); + cub::detail::batch_memcpy::VectorizedCopy(threadIdx.x, d_out, copy_size, d_in); } struct TupleMemberEqualityOp @@ -409,7 +409,7 @@ template __global__ void TestBitPackedCounterKernel(uint32_t* bins, uint32_t* increments, uint32_t* counts_out, uint32_t num_items) { - using BitPackedCounterT = cub::detail::BitPackedCounter; + using BitPackedCounterT = cub::detail::batch_memcpy::BitPackedCounter; BitPackedCounterT counter{}; for (uint32_t i = 0; i < num_items; i++) { diff --git a/cub/test/test_device_spmv.cu b/cub/test/test_device_spmv.cu index 5a120e56e96..13dba77a594 100644 --- a/cub/test/test_device_spmv.cu +++ b/cub/test/test_device_spmv.cu @@ -47,6 +47,8 @@ #include #include +_CCCL_SUPPRESS_DEPRECATED_PUSH + bool g_verbose = false; //============================================================================== @@ -605,3 +607,5 @@ int main(int argc, char** argv) test_types(); } + +_CCCL_SUPPRESS_DEPRECATED_POP diff --git a/cub/test/test_grid_barrier.cu b/cub/test/test_grid_barrier.cu index e763b48d1e2..c3f6bb5eea1 100644 --- a/cub/test/test_grid_barrier.cu +++ b/cub/test/test_grid_barrier.cu @@ -109,7 +109,7 @@ int main(int argc, char** argv) int sm_count, max_block_threads, max_sm_occupancy; CubDebugExit(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); CubDebugExit(cudaDeviceGetAttribute(&max_block_threads, cudaDevAttrMaxThreadsPerBlock, device_ordinal)); - CubDebugExit(MaxSmOccupancy(max_sm_occupancy, EmptyKernel, 32)); + CubDebugExit(MaxSmOccupancy(max_sm_occupancy, detail::EmptyKernel, 32)); // Compute grid size and occupancy int occupancy = CUB_MIN((max_block_threads / block_size), max_sm_occupancy); diff --git a/cub/test/test_util.h b/cub/test/test_util.h index e61cd7cd6e2..c06d803ecb1 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -614,7 +614,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T& value, s case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + cuda::std::terminate(); break; case UNIFORM: value = 2; @@ -656,7 +656,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool& value case RANDOM_BIT: case RANDOM_MINUS_PLUS_ZERO: _CubLog("%s\n", "cub::InitValue cannot generate 
random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + cuda::std::terminate(); break; case UNIFORM: value = true; @@ -697,7 +697,7 @@ InitValue(GenMode gen_mode, CUB_NS_QUALIFIER::KeyValuePair& value, ), ( // NV_IS_DEVICE _CubLog("%s\n", "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); + cuda::std::terminate(); )); // clang-format on } diff --git a/cub/test/thread_reduce/catch2_test_thread_reduce.cu b/cub/test/thread_reduce/catch2_test_thread_reduce.cu index fe88ea003be..ba7342db9a5 100644 --- a/cub/test/thread_reduce/catch2_test_thread_reduce.cu +++ b/cub/test/thread_reduce/catch2_test_thread_reduce.cu @@ -48,6 +48,7 @@ #include "c2h/custom_type.h" #include "c2h/extended_types.h" #include "c2h/generators.h" +#include /*********************************************************************************************************************** * Thread Reduce Wrapper Kernels @@ -285,7 +286,7 @@ _CCCL_TEMPLATE(typename T) _CCCL_REQUIRES((::cuda::std::is_floating_point::value)) void verify_results(const T& expected_data, const T& test_results) { - REQUIRE(expected_data == Approx(test_results).epsilon(0.05)); + REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05})); } _CCCL_TEMPLATE(typename T) diff --git a/cudax/include/cuda/experimental/__hierarchy/level_dimensions.cuh b/cudax/include/cuda/experimental/__hierarchy/level_dimensions.cuh index f46c581ceb9..5c1fd9cab59 100644 --- a/cudax/include/cuda/experimental/__hierarchy/level_dimensions.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/level_dimensions.cuh @@ -118,7 +118,7 @@ struct level_dimensions using level_type = Level; // Needs alignas to work around an issue with tuple - alignas(16) const Dimensions dims; // Unit for dimensions is implicit + alignas(16) Dimensions dims; // Unit for dimensions is implicit _CCCL_HOST_DEVICE constexpr level_dimensions(const Dimensions& d) : dims(d) diff --git a/cudax/include/cuda/experimental/__stf/utility/memory.cuh b/cudax/include/cuda/experimental/__stf/utility/memory.cuh index bd3c895ee16..b0f987ca63b 100644 --- a/cudax/include/cuda/experimental/__stf/utility/memory.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/memory.cuh @@ -29,6 +29,7 @@ #include +#include #include namespace cuda::experimental::stf diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 3c4abf38c91..c6fb198f35b 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -8,9 +8,6 @@ find_package(Thrust ${cudax_VERSION} EXACT CONFIG ) thrust_create_target(cudax.test.thrust) -add_library(catch2_main STATIC catch2_helpers/catch2_main.cpp) -target_link_libraries(catch2_main PUBLIC Catch2::Catch2) - ## cudax_add_test # # Add a catch2 test executable and register it with ctest. 
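These test-infrastructure hunks track the migration to Catch2 v3: the hand-rolled catch2_main translation unit above is dropped in favor of the library-provided Catch2::Catch2WithMain target, and the thread_reduce test earlier in this patch moves from the deprecated Approx helper to the WithinRel matcher. A tiny sketch of the matcher usage; the 0.05 tolerance mirrors the test above, while the test name and values are illustrative:

#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>

TEST_CASE("WithinRel checks a relative tolerance", "[sketch]")
{
  const float expected = 1.00f;
  const float computed = 1.04f; // differs from expected by less than 5 %
  REQUIRE_THAT(expected, Catch::Matchers::WithinRel(computed, 0.05f));
}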
@@ -34,8 +31,7 @@ function(cudax_add_catch2_test target_name_var test_name cn_target) # ARGN=test target_link_libraries(${test_target} PRIVATE ${cn_target} cudax.test.thrust - Catch2::Catch2 - catch2_main + Catch2::Catch2WithMain ) target_compile_options(${test_target} PRIVATE "-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE" diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh index 661d087f3bc..c3c0f40869e 100644 --- a/cudax/test/algorithm/common.cuh +++ b/cudax/test/algorithm/common.cuh @@ -17,7 +17,7 @@ #include #include -#include +#include #include inline constexpr uint8_t fill_byte = 1; diff --git a/cudax/test/catch2_helpers/catch2_main.cpp b/cudax/test/catch2_helpers/catch2_main.cpp deleted file mode 100644 index 23afde17338..00000000000 --- a/cudax/test/catch2_helpers/catch2_main.cpp +++ /dev/null @@ -1,2 +0,0 @@ -#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do this in one cpp file -#include diff --git a/cudax/test/common/testing.cuh b/cudax/test/common/testing.cuh index 965ae398e7b..65dc438c307 100644 --- a/cudax/test/common/testing.cuh +++ b/cudax/test/common/testing.cuh @@ -11,15 +11,39 @@ #ifndef __COMMON_TESTING_H__ #define __COMMON_TESTING_H__ +#include + #include #include // IWYU pragma: keep #include #include -#include +#include +#include #include +// workaround for error #3185-D: no '#pragma diagnostic push' was found to match this 'diagnostic pop' +#if _CCCL_COMPILER(NVHPC) +# undef CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma("diag push") +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma("diag pop") +#endif +// workaround for error +// * MSVC14.39: #3185-D: no '#pragma diagnostic push' was found to match this 'diagnostic pop' +// * MSVC14.29: internal error: assertion failed: alloc_copy_of_pending_pragma: copied pragma has source sequence entry +// (pragma.c, line 526 in alloc_copy_of_pending_pragma) +// see also upstream Catch2 issue: https://github.com/catchorg/Catch2/issues/2636 +#if _CCCL_COMPILER(MSVC) +# undef CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# undef CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +# define CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS +#endif + namespace cuda::experimental::__async { } diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu index 4cd09badfa6..392f5fb2944 100644 --- a/cudax/test/containers/uninitialized_async_buffer.cu +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -23,7 +23,6 @@ #include #include "testing.cuh" -#include struct do_not_construct { diff --git a/cudax/test/event/event_smoke.cu b/cudax/test/event/event_smoke.cu index 58cbb638daa..45d791652a0 100644 --- a/cudax/test/event/event_smoke.cu +++ b/cudax/test/event/event_smoke.cu @@ -11,7 +11,7 @@ #include #include -#include +#include #include namespace diff --git a/cudax/test/execution/env.cu b/cudax/test/execution/env.cu index 55663ad78be..ec0985d3759 100644 --- a/cudax/test/execution/env.cu +++ b/cudax/test/execution/env.cu @@ -14,7 +14,7 @@ #include #include -#include +#include namespace cudax = cuda::experimental; using env_t = cudax::env_t; diff --git a/cudax/test/execution/policies/get_execution_policy.cu b/cudax/test/execution/policies/get_execution_policy.cu 
index 11c4937f410..1315a58a2a0 100644 --- a/cudax/test/execution/policies/get_execution_policy.cu +++ b/cudax/test/execution/policies/get_execution_policy.cu @@ -12,7 +12,7 @@ #include -#include +#include using cuda::experimental::execution::execution_policy; diff --git a/cudax/test/execution/policies/policies.cu b/cudax/test/execution/policies/policies.cu index 781397e2ee9..c8073cdb45f 100644 --- a/cudax/test/execution/policies/policies.cu +++ b/cudax/test/execution/policies/policies.cu @@ -12,7 +12,7 @@ #include -#include +#include namespace cudax = cuda::experimental; diff --git a/cudax/test/green_context/green_ctx_smoke.cu b/cudax/test/green_context/green_ctx_smoke.cu index 01b2571e55f..b353cb2e3c1 100644 --- a/cudax/test/green_context/green_ctx_smoke.cu +++ b/cudax/test/green_context/green_ctx_smoke.cu @@ -11,7 +11,7 @@ #include #include -#include +#include #include #if CUDART_VERSION >= 12050 @@ -23,7 +23,7 @@ TEST_CASE("Green context", "[green_context]") } else { - INFO("Can create a green context") + INFO("Can create a green context"); { { [[maybe_unused]] cudax::green_context ctx(cudax::devices[0]); @@ -35,7 +35,7 @@ TEST_CASE("Green context", "[green_context]") } } - INFO("Can create streams under green context") + INFO("Can create streams under green context"); { cudax::green_context green_ctx_dev0(cudax::devices[0]); cudax::stream stream_under_green_ctx(green_ctx_dev0); @@ -47,7 +47,7 @@ TEST_CASE("Green context", "[green_context]") CUDAX_REQUIRE(stream_dev1.device() == 1); } - INFO("Can create a side stream") + INFO("Can create a side stream"); { auto ldev1 = stream_under_green_ctx.logical_device(); CUDAX_REQUIRE(ldev1.get_kind() == cudax::logical_device::kinds::green_context); @@ -60,4 +60,10 @@ TEST_CASE("Green context", "[green_context]") } } } +#else +// For some reason CI fails with empty test, add a dummy test case +TEST_CASE("Dummy test case") +{ + CUDAX_REQUIRE(1 == 1); +} #endif // CUDART_VERSION >= 12050 diff --git a/cudax/test/hierarchy/hierarchy_smoke.cu b/cudax/test/hierarchy/hierarchy_smoke.cu index 62be4f5aac6..cf359aa3318 100644 --- a/cudax/test/hierarchy/hierarchy_smoke.cu +++ b/cudax/test/hierarchy/hierarchy_smoke.cu @@ -10,6 +10,7 @@ #include +#include "testing.cuh" #include #include @@ -380,11 +381,11 @@ TEST_CASE("On device rank calculation", "[hierarchy]") CUDART(cudaMalloc((void**) &ptr, 2 * 1024 * sizeof(unsigned int))); const auto config_static = cudax::block_dims<256>() & cudax::grid_dims(dim3(2, 2, 2)); - rank_kernel<<<256, dim3(2, 2, 2)>>>(config_static, ptr); + rank_kernel<<>>(config_static, ptr); CUDART(cudaDeviceSynchronize()); - rank_kernel_cg<<<256, dim3(2, 2, 2)>>>(config_static, ptr); + rank_kernel_cg<<>>(config_static, ptr); CUDART(cudaDeviceSynchronize()); - rank_kernel_optimized<<<256, dim3(2, 2, 2)>>>(config_static, ptr); + rank_kernel_optimized<<>>(config_static, ptr); CUDART(cudaDeviceSynchronize()); CUDART(cudaFree(ptr)); } diff --git a/cudax/test/memory_resource/any_async_resource.cu b/cudax/test/memory_resource/any_async_resource.cu index c491c9efa21..9dbb898fc08 100644 --- a/cudax/test/memory_resource/any_async_resource.cu +++ b/cudax/test/memory_resource/any_async_resource.cu @@ -11,7 +11,6 @@ #include #include "test_resource.cuh" -#include #include #ifndef __CUDA_ARCH__ diff --git a/cudax/test/memory_resource/device_memory_pool.cu b/cudax/test/memory_resource/device_memory_pool.cu index 351c3d8a0ed..bbfae3385d8 100644 --- a/cudax/test/memory_resource/device_memory_pool.cu +++ b/cudax/test/memory_resource/device_memory_pool.cu @@ 
-18,7 +18,6 @@ #include -#include #include namespace cudax = cuda::experimental; diff --git a/cudax/test/memory_resource/device_memory_resource.cu b/cudax/test/memory_resource/device_memory_resource.cu index aefbb8b1bf7..44402c430e1 100644 --- a/cudax/test/memory_resource/device_memory_resource.cu +++ b/cudax/test/memory_resource/device_memory_resource.cu @@ -16,7 +16,7 @@ #include -#include +#include #include namespace cudax = cuda::experimental; diff --git a/cudax/test/memory_resource/get_memory_resource.cu b/cudax/test/memory_resource/get_memory_resource.cu index c61967fa7a5..389bb955624 100644 --- a/cudax/test/memory_resource/get_memory_resource.cu +++ b/cudax/test/memory_resource/get_memory_resource.cu @@ -13,7 +13,6 @@ #include #include "test_resource.cuh" -#include #include using device_resource = cuda::experimental::device_memory_resource; diff --git a/cudax/test/memory_resource/managed_memory_resource.cu b/cudax/test/memory_resource/managed_memory_resource.cu index 1c5836192ba..c0a4f66dc62 100644 --- a/cudax/test/memory_resource/managed_memory_resource.cu +++ b/cudax/test/memory_resource/managed_memory_resource.cu @@ -17,7 +17,7 @@ #include -#include +#include #include namespace cudax = cuda::experimental; diff --git a/cudax/test/memory_resource/pinned_memory_resource.cu b/cudax/test/memory_resource/pinned_memory_resource.cu index 4240491c6a3..bcbe3a315ec 100644 --- a/cudax/test/memory_resource/pinned_memory_resource.cu +++ b/cudax/test/memory_resource/pinned_memory_resource.cu @@ -18,7 +18,7 @@ #include #include "cuda/__memory_resource/resource_ref.h" -#include +#include #include namespace cudax = cuda::experimental; diff --git a/cudax/test/memory_resource/shared_resource.cu b/cudax/test/memory_resource/shared_resource.cu index cd279ab0b9e..29e6122774e 100644 --- a/cudax/test/memory_resource/shared_resource.cu +++ b/cudax/test/memory_resource/shared_resource.cu @@ -12,7 +12,6 @@ #include #include "test_resource.cuh" -#include #include TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource]", big_resource, small_resource) diff --git a/cudax/test/memory_resource/test_resource.cuh b/cudax/test/memory_resource/test_resource.cuh index 75cd9b665b2..644cace7abc 100644 --- a/cudax/test/memory_resource/test_resource.cuh +++ b/cudax/test/memory_resource/test_resource.cuh @@ -8,7 +8,6 @@ #include #include -#include #include using std::size_t; diff --git a/cudax/test/stream/get_stream.cu b/cudax/test/stream/get_stream.cu index 43cb9921990..5e0f6417ac9 100644 --- a/cudax/test/stream/get_stream.cu +++ b/cudax/test/stream/get_stream.cu @@ -10,7 +10,7 @@ #include -#include +#include #include TEST_CASE("Can call get_stream on a cudaStream_t", "[stream]") diff --git a/cudax/test/stream/stream_smoke.cu b/cudax/test/stream/stream_smoke.cu index fd50ab7adf7..62cc8dcad45 100644 --- a/cudax/test/stream/stream_smoke.cu +++ b/cudax/test/stream/stream_smoke.cu @@ -11,7 +11,7 @@ #include #include -#include +#include #include TEST_CASE("Can create a stream and launch work into it", "[stream]") @@ -122,7 +122,7 @@ TEST_CASE("Stream get device", "[stream]") auto stream_ref_cudart = cudax::stream_ref(stream_handle); CUDAX_REQUIRE(stream_ref_cudart.device() == *std::prev(cudax::devices.end())); - INFO("Can create a side stream using logical device") + INFO("Can create a side stream using logical device"); { if (test::cuda_driver_version() >= 12050) { diff --git a/docs/cccl_development/macro.rst b/docs/cccl_development/macro.rst index 6bf1b0b67ab..30de6aa8b10 100644 --- 
a/docs/cccl_development/macro.rst +++ b/docs/cccl_development/macro.rst @@ -264,13 +264,15 @@ Usage example: **Portable Builtin Macros**: -+-----------------------------+--------------------------------------------+ -| ``_CCCL_UNREACHABLE()`` | Portable ``__builtin_unreachable()`` | -+-----------------------------+--------------------------------------------+ -| ``_CCCL_BUILTIN_ASSUME(X)`` | Portable ``__builtin_assume(X)`` | -+-----------------------------+--------------------------------------------+ -| ``_CCCL_BUILTIN_EXPECT(X)`` | Portable ``__builtin_expected(X)`` | -+-----------------------------+--------------------------------------------+ ++---------------------------------------+--------------------------------------------+ +| ``_CCCL_UNREACHABLE()`` | Portable ``__builtin_unreachable()`` | ++---------------------------------------+--------------------------------------------+ +| ``_CCCL_BUILTIN_ASSUME(X)`` | Portable ``__builtin_assume(X)`` | ++---------------------------------------+--------------------------------------------+ +| ``_CCCL_BUILTIN_EXPECT(X)`` | Portable ``__builtin_expected(X)`` | ++---------------------------------------+--------------------------------------------+ +| ``_CCCL_BUILTIN_PREFETCH(X[, Y, Z])`` | Portable ``__builtin_prefetch(X, Y, Z)`` | ++---------------------------------------+--------------------------------------------+ **Portable Keyword Macros** diff --git a/docs/cub/developer_overview.rst b/docs/cub/developer_overview.rst index a0a78ed0d71..4cc639e27fb 100644 --- a/docs/cub/developer_overview.rst +++ b/docs/cub/developer_overview.rst @@ -239,8 +239,8 @@ For example, :cpp:struct:`cub::WarpReduce` dispatches to two different implement using InternalWarpReduce = cuda::std::conditional_t< IS_POW_OF_TWO, - WarpReduceShfl, // shuffle-based implementation - WarpReduceSmem>; // smem-based implementation + detail::WarpReduceShfl, // shuffle-based implementation + detail::WarpReduceSmem>; // smem-based implementation Specializations provide different shared memory requirements, so the actual ``_TempStorage`` type is defined as: diff --git a/docs/cuda_parallel/index.rst b/docs/cuda_parallel/index.rst index e494fb1e323..c54feb81f85 100644 --- a/docs/cuda_parallel/index.rst +++ b/docs/cuda_parallel/index.rst @@ -22,3 +22,9 @@ Iterators :members: :undoc-members: :imported-members: + +Utilities +--------- + +.. automodule:: cuda.parallel.experimental.struct + :members: diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst index 5e9af18aae2..59c6068a09c 100644 --- a/docs/libcudacxx/extended_api/math.rst +++ b/docs/libcudacxx/extended_api/math.rst @@ -1,52 +1,28 @@ .. _libcudacxx-extended-api-math: Math -===== +==== -.. code:: cuda +.. toctree:: + :hidden: + :maxdepth: 1 - template - [[nodiscard]] __host__ __device__ constexpr T ceil_div(T a, T b) noexcept; + cuda::ceil_div + cuda::round_up + cuda::round_down -ceil_div ---------- +.. list-table:: + :widths: 25 45 30 + :header-rows: 0 -- _Requires_: `is_integral_v` is true. -- _Preconditions_: `a >= 0` is true and `b > 0` is true. -- _Returns_: divides `a` by `b`. If `a` is not a multiple of `b` rounds the result up to the next integer value. + * - :ref:`ceil_div ` + - Ceiling division + - CCCL 2.6.0 / CUDA 12.6 -.. 
note:: + * - :ref:`round_up ` + - Round to the next multiple + - CCCL 2.9.0 / CUDA 12.9 - The function is only constexpr from C++14 onwards - -**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: - -.. code:: cuda - - #include - #include - - __global__ void vscale(int n, float s, float *x) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) x[i] *= s; - } - - int main() { - const int n = 100000; - const float s = 2.f; - std::vector x(n, 1.f); - - // Given a fixed number of threads per block... - constexpr int threads_per_block = 256; - - // ...dividing some "n" by "threads_per_block" may lead to a remainder, - // requiring the kernel to be launched with an extra thread block to handle it. - const int thread_blocks = cuda::ceil_div(n, threads_per_block); - - vscale<<>>(n, s, x.data()); - cudaDeviceSynchronize(); - - return 0; - } - -`See it on Godbolt TODO` + * - :ref:`round_down ` + - Round to the previous multiple + - CCCL 2.9.0 / CUDA 12.9 diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst new file mode 100644 index 00000000000..df6d8c973fa --- /dev/null +++ b/docs/libcudacxx/extended_api/math/ceil_div.rst @@ -0,0 +1,52 @@ +.. _libcudacxx-extended-api-math-ceil-div: + +``ceil_div`` Ceiling Division +============================= + +.. code:: cuda + + template + [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept; + +``value``: The value to be divided. +``divisor``: The divisor. + +- *Requires*: ``is_integral_v`` is true and ``is_integral_v`` is true. +- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. +- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value. + +.. note:: + + The function is only constexpr from C++14 onwards + +**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: + +.. code:: cuda + + #include + #include + + __global__ void vscale(int n, float s, float *x) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) x[i] *= s; + } + + int main() { + const int n = 100000; + const float s = 2.f; + std::vector x(n, 1.f); + + // Given a fixed number of threads per block... + constexpr int threads_per_block = 256; + + // ...dividing some "n" by "threads_per_block" may lead to a remainder, + // requiring the kernel to be launched with an extra thread block to handle it. + const int thread_blocks = cuda::ceil_div(n, threads_per_block); + + vscale<<>>(n, s, x.data()); + cudaDeviceSynchronize(); + + return 0; + } + +`See it on Godbolt TODO` diff --git a/docs/libcudacxx/extended_api/math/round_down.rst b/docs/libcudacxx/extended_api/math/round_down.rst new file mode 100644 index 00000000000..20a80998fd3 --- /dev/null +++ b/docs/libcudacxx/extended_api/math/round_down.rst @@ -0,0 +1,38 @@ +.. _libcudacxx-extended-api-math-round-down: + +``round_down`` Round to the previous multiple +============================================= + +.. code:: cuda + + template + [[nodiscard]] __host__ __device__ inline + constexpr cuda::std::common_type_t round_down(T value, U base_multiple) noexcept; + +``value``: The value to be rounded down. +``base_multiple``: The base multiple to which the value rounds down. 
+ +- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators. +- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. +- *Returns*: ``a`` rounded down to the largest multiple of ``b`` less than or equal to ``a``. If ``a`` is already a multiple of ``b``, returns ``a``. + +.. note:: + + The function is only available from C++17 onwards. + +**Performance considerations**: + +- The function performs a truncation division followed by a multiplication. It provides better performance than ``a / b * b`` when the common type is a signed integer. + +**Example**: + +.. code:: cuda + + #include + + __global__ void example_kernel(int a, unsigned b, unsigned* result) { + // a = 7, b = 3 -> result = 6 + *result = cuda::round_down(a, b); + } + +`See it on Godbolt TODO` diff --git a/docs/libcudacxx/extended_api/math/round_up.rst b/docs/libcudacxx/extended_api/math/round_up.rst new file mode 100644 index 00000000000..13c282aaad7 --- /dev/null +++ b/docs/libcudacxx/extended_api/math/round_up.rst @@ -0,0 +1,40 @@ +.. _libcudacxx-extended-api-math-round-up: + +``round_up`` Round to the next multiple +======================================= + +.. code:: cuda + + template + [[nodiscard]] __host__ __device__ inline + constexpr cuda::std::common_type_t round_up(T value, U base_multiple) noexcept; + +``value``: The value to be rounded up. +``base_multiple``: The base multiple to which the value rounds up. + +- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators. +- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. +- *Returns*: ``a`` rounded up to the smallest multiple of ``b`` greater than or equal to ``a``. If ``a`` is already a multiple of ``b``, returns ``a``. +- *Note*: The result can overflow if ``ceil(a / b) * b`` exceeds the maximum value of the common type of + ``a`` and ``b``. The condition is checked in debug mode. + +.. note:: + + The function is only available from C++17 onwards. + +**Performance considerations**: + +- The function performs a ceiling division (``cuda::ceil_div()``) followed by a multiplication. + +**Example**: + +.. code:: cuda + + #include + + __global__ void example_kernel(int a, unsigned b, unsigned* result) { + // a = 7, b = 3 -> result = 9 + *result = cuda::round_up(a, b); + } + +`See it on Godbolt TODO` diff --git a/docs/libcudacxx/standard_api.rst b/docs/libcudacxx/standard_api.rst index be806240615..dda4e56b9d9 100644 --- a/docs/libcudacxx/standard_api.rst +++ b/docs/libcudacxx/standard_api.rst @@ -14,6 +14,7 @@ Standard API standard_api/ranges_library standard_api/synchronization_library standard_api/time_library + standard_api/type_support standard_api/utility_library Standard Library Backports @@ -112,7 +113,14 @@ Feature availability: - C++26 ``std::dims`` is available in C++14. -- C++23 ``forward_like``, ``to_underlying`` and ``unreachable`` from ```` are available in C++11. +- C++26 ``std::linalg`` accessors, transposed layout, and related functions are available in C++17. + + - ``scaled()`` and ``scaled_accessor`` + - ``conjugated()`` and ``conjugated_accessor`` + - ``transposed()`` and ``layout_transpose`` + - ``conjugate_transposed()`` + +- C++23 ``forward_like``, ``to_underlying``, and ``unreachable`` from ```` are available in C++11. - C++23 ``is_scoped_enum`` in ```` is available in C++11.
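The standard_api.rst hunk above only lists the backported names. A minimal usage sketch follows (not from this patch); it assumes the usual libcu++ header layout in which ``to_underlying`` and ``unreachable`` live in <cuda/std/utility> (the header name is elided in the table above) and that the code is compiled as CUDA C++11 or later:

#include <cuda/std/utility>

enum class axis : unsigned char { x = 0, y = 1, z = 2 };

__host__ __device__ unsigned char axis_index(axis a)
{
  switch (a)
  {
    case axis::x:
    case axis::y:
    case axis::z:
      // to_underlying converts the enumerator to its underlying type (unsigned char here)
      // without spelling the type out in a static_cast.
      return cuda::std::to_underlying(a);
    default:
      // unreachable marks control flow that cannot occur, letting the compiler optimize it away.
      cuda::std::unreachable();
  }
}

Both calls work in host and device code per the availability note above; the enum and function names here are purely illustrative.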
diff --git a/docs/libcudacxx/standard_api/c_library.rst b/docs/libcudacxx/standard_api/c_library.rst index 9751a1dcb4e..122b15998a3 100644 --- a/docs/libcudacxx/standard_api/c_library.rst +++ b/docs/libcudacxx/standard_api/c_library.rst @@ -30,3 +30,6 @@ Any Standard C++ header not listed below is omitted. * - `\ `_ - Common utilities - libcu++ 2.2.0 / CCCL 2.2.0 / CUDA 12.3 + * - `\ `_ + - Provides array manipulation functions `memcpy` and `memset` + - CCCL 3.0.0 diff --git a/docs/libcudacxx/standard_api/numerics_library.rst b/docs/libcudacxx/standard_api/numerics_library.rst index 5310cd6ddf9..4181e301864 100644 --- a/docs/libcudacxx/standard_api/numerics_library.rst +++ b/docs/libcudacxx/standard_api/numerics_library.rst @@ -10,6 +10,7 @@ Numerics Library numerics_library/bit numerics_library/complex numerics_library/numeric + numerics_library/linalg Any Standard C++ header not listed below is omitted. diff --git a/docs/libcudacxx/standard_api/numerics_library/linalg.rst b/docs/libcudacxx/standard_api/numerics_library/linalg.rst new file mode 100644 index 00000000000..cc034eeab94 --- /dev/null +++ b/docs/libcudacxx/standard_api/numerics_library/linalg.rst @@ -0,0 +1,31 @@ +.. _libcudacxx-standard-api-numerics-linalg: + +```` +============================================ + +Provided functionalities +------------------------ + +- ``scaled()`` `std::linalg::scaled `_ +- ``scaled_accessor`` `std::linalg::scaled_accessor `_ +- ``conjugated()`` `std::linalg::conjugated `_ +- ``conjugated_accessor`` `std::linalg::conjugated_accessor `_ +- ``transposed()`` `std::linalg::transposed `_ +- ``layout_transpose`` `std::linalg::layout_transpose `_ +- ``conjugate_transposed()`` `std::linalg::conjugate_transposed `_ + +Extensions +---------- + +- C++26 ``std::linalg`` accessors, transposed layout, and related functions are available in C++17 + +Omissions +--------- + +- Currently we do not expose any BLAS functions and layouts. + +Restrictions +------------ + +- On device no exceptions are thrown in case of a bad access. +- MSVC is only supported with C++20 diff --git a/docs/libcudacxx/standard_api/ranges_library.rst b/docs/libcudacxx/standard_api/ranges_library.rst index 25841bd7b5a..61e023928bc 100644 --- a/docs/libcudacxx/standard_api/ranges_library.rst +++ b/docs/libcudacxx/standard_api/ranges_library.rst @@ -13,10 +13,10 @@ See the documentation of the standard headers `\ + * - `\ `_ - Iterator related concepts and machinery such as ``cuda::std::forward_iterator`` - CCCL 2.3.0 / CUDA 12.4 - * - + * - `\ `_ - Range related concepts and machinery such as ``cuda::std::ranges::forward_range`` and ``cuda::std::ranges::subrange`` - CCCL 2.4.0 / CUDA 12.5 diff --git a/docs/libcudacxx/standard_api/type_support.rst b/docs/libcudacxx/standard_api/type_support.rst new file mode 100644 index 00000000000..52b64de6a62 --- /dev/null +++ b/docs/libcudacxx/standard_api/type_support.rst @@ -0,0 +1,24 @@ +.. _libcudacxx-standard-api-type-support: + +Type Support Library +======================= + +.. toctree:: + :hidden: + :maxdepth: 1 + +Any Standard C++ header not listed below is omitted. + +.. 
list-table:: + :widths: 25 45 30 + :header-rows: 1 + + * - Header + - Content + - Availability + * - `\ `_ + - Limits of integral types + - libcu++ 1.0.0 / CCCL 2.0.0 / CUDA 10.2 + * - `\ `_ + - Interface to query properties of all fundamental numeric types + - libcu++ 1.0.0 / CCCL 2.0.0 / CUDA 10.2 diff --git a/docs/repo.toml b/docs/repo.toml index e949beb6e7c..999d62a8f20 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -347,6 +347,7 @@ autodoc.mock_imports = [ "numba", "pynvjitlink", "cuda.bindings", + "cuda.cccl", "llvmlite", "numpy", ] diff --git a/examples/basic/CMakeLists.txt b/examples/basic/CMakeLists.txt index f664422335e..cc50b7d1dde 100644 --- a/examples/basic/CMakeLists.txt +++ b/examples/basic/CMakeLists.txt @@ -41,6 +41,7 @@ endif() # Creates a cmake executable target for the main program add_executable(example_project example.cu) +target_compile_features(example_project PUBLIC cuda_std_17) # "Links" the CCCL Cmake target to the `example_project` executable. This configures everything needed to use # CCCL headers, including setting up include paths, compiler flags, etc. diff --git a/libcudacxx/examples/trie.cu b/libcudacxx/examples/trie.cu index 9144b8bc41c..f14e81eaabf 100644 --- a/libcudacxx/examples/trie.cu +++ b/libcudacxx/examples/trie.cu @@ -149,19 +149,19 @@ inline void assert_(cudaError_t code, const char* file, int line) template struct managed_allocator { - typedef cuda::std::size_t size_type; - typedef cuda::std::ptrdiff_t difference_type; + using size_type = cuda::std::size_t; + using difference_type = cuda::std::ptrdiff_t; - typedef T value_type; - typedef T* pointer; // (deprecated in C++17)(removed in C++20) T* - typedef const T* const_pointer; // (deprecated in C++17)(removed in C++20) const T* - typedef T& reference; // (deprecated in C++17)(removed in C++20) T& - typedef const T& const_reference; // (deprecated in C++17)(removed in C++20) const T& + using value_type = T; + using pointer = T*; // (deprecated in C++17)(removed in C++20) T* + using const_pointer = const T*; // (deprecated in C++17)(removed in C++20) const T* + using reference = T&; // (deprecated in C++17)(removed in C++20) T& + using const_reference = const T&; // (deprecated in C++17)(removed in C++20) const T& template struct rebind { - typedef managed_allocator other; + using other = managed_allocator; }; managed_allocator() = default; template diff --git a/libcudacxx/include/cuda/__cmath/round_down.h b/libcudacxx/include/cuda/__cmath/round_down.h new file mode 100644 index 00000000000..7bf42f050f9 --- /dev/null +++ b/libcudacxx/include/cuda/__cmath/round_down.h @@ -0,0 +1,103 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___CMATH_ROUND_DOWN_H +#define _CUDA___CMATH_ROUND_DOWN_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! @brief Round the number \p __a to the previous multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> +round_down(const _Tp __a, const _Up __b) noexcept +{ + _CCCL_ASSERT(__b > _Up{0}, "cuda::round_down: 'b' must be positive"); + if constexpr (_CUDA_VSTD::is_signed_v<_Tp>) + { + _CCCL_ASSERT(__a >= _Tp{0}, "cuda::round_down: 'a' must be non negative"); + } + using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>; + using _Prom = decltype(_Tp{} / _Up{}); + using _UProm = _CUDA_VSTD::make_unsigned_t<_Prom>; + auto __c1 = static_cast<_UProm>(__a) / static_cast<_UProm>(__b); + return static_cast<_Common>(__c1 * static_cast<_UProm>(__b)); +} + +//! @brief Round the number \p __a to the previous multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>> +round_down(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_down(__a, _CUDA_VSTD::to_underlying(__b)); +} + +//! @brief Round the number \p __a to the previous multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up> +round_down(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_down(_CUDA_VSTD::to_underlying(__a), __b); +} + +//! @brief Round the number \p __a to the previous multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! 
@pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) +_CCCL_NODISCARD +_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, + _CUDA_VSTD::underlying_type_t<_Up>> +round_down(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_down(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 +#endif // _CUDA___CMATH_ROUND_DOWN_H diff --git a/libcudacxx/include/cuda/__cmath/round_up.h b/libcudacxx/include/cuda/__cmath/round_up.h new file mode 100644 index 00000000000..cf9bb9975f5 --- /dev/null +++ b/libcudacxx/include/cuda/__cmath/round_up.h @@ -0,0 +1,105 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___CMATH_ROUND_UP_H +#define _CUDA___CMATH_ROUND_UP_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +//! @brief Round the number \p __a to the next multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> +round_up(const _Tp __a, const _Up __b) noexcept +{ + _CCCL_ASSERT(__b > _Up{0}, "cuda::round_up: 'b' must be positive"); + if constexpr (_CUDA_VSTD::is_signed_v<_Tp>) + { + _CCCL_ASSERT(__a >= _Tp{0}, "cuda::round_up: 'a' must be non negative"); + } + using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>; + using _Prom = decltype(_Tp{} / _Up{}); + auto __c = ::cuda::ceil_div(static_cast<_Prom>(__a), static_cast<_Prom>(__b)); + _CCCL_ASSERT(static_cast<_Common>(__c) <= _CUDA_VSTD::numeric_limits<_Common>::max() / static_cast<_Common>(__b), + "cuda::round_up: result overflow"); + return static_cast<_Common>(static_cast<_Prom>(__c) * static_cast<_Prom>(__b)); +} + +//! @brief Round the number \p __a to the next multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>> +round_up(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_up(__a, _CUDA_VSTD::to_underlying(__b)); +} + +//! 
@brief Round the number \p __a to the next multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up> +round_up(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_up(_CUDA_VSTD::to_underlying(__a), __b); +} + +//! @brief Round the number \p __a to the next multiple of \p __b +//! @param __a The input number +//! @param __b The multiplicand +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) +_CCCL_NODISCARD +_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, + _CUDA_VSTD::underlying_type_t<_Up>> +round_up(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::round_up(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CCCL_STD_VER >= 2017 +#endif // _CUDA___CMATH_ROUND_UP_H diff --git a/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h index cb8fcb69083..72c413d65a4 100644 --- a/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h +++ b/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include @@ -135,7 +135,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memc ( // Host code path: if (__group.thread_rank() == 0) { - memcpy(__dest_char, __src_char, __size); + _CUDA_VSTD::memcpy(__dest_char, __src_char, __size); } return __completion_mechanism::__sync;)); } diff --git a/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h b/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h index 9ce2b455d59..f5af15bd51a 100644 --- a/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h +++ b/libcudacxx/include/cuda/__ptx/ptx_helper_functions.h @@ -26,6 +26,8 @@ #include #include +#if _CCCL_HAS_CUDA_COMPILER + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX template @@ -103,9 +105,9 @@ inline _CCCL_DEVICE _Tp* __from_ptr_gmem(_CUDA_VSTD::size_t __ptr) template inline _CCCL_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val) { -#if _CCCL_STD_VER >= 2017 +# if _CCCL_STD_VER >= 2017 static_assert(sizeof(_Tp) == 4, ""); -#endif // _CCCL_STD_VER >= 2017 +# endif // _CCCL_STD_VER >= 2017 // Consider using std::bitcast return *reinterpret_cast<_CUDA_VSTD::uint32_t*>(&__val); } @@ -113,13 +115,15 @@ inline _CCCL_DEVICE _CUDA_VSTD::uint32_t __as_b32(_Tp __val) template inline _CCCL_DEVICE _CUDA_VSTD::uint64_t __as_b64(_Tp __val) { -#if _CCCL_STD_VER >= 2017 +# if _CCCL_STD_VER >= 2017 static_assert(sizeof(_Tp) == 8, ""); -#endif // _CCCL_STD_VER >= 2017 +# endif // _CCCL_STD_VER >= 2017 // Consider using std::bitcast return *reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__val); } _LIBCUDACXX_END_NAMESPACE_CUDA_PTX +#endif // _CCCL_HAS_CUDA_COMPILER + #endif // _CUDA_PTX_HELPER_FUNCTIONS_H_ diff --git a/libcudacxx/include/cuda/__type_traits/is_floating_point.h b/libcudacxx/include/cuda/__type_traits/is_floating_point.h new file mode 100644 index 00000000000..e253315a672 --- /dev/null +++ 
b/libcudacxx/include/cuda/__type_traits/is_floating_point.h @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDA__TYPE_TRAITS_IS_FLOATING_POINT_H +#define __CUDA__TYPE_TRAITS_IS_FLOATING_POINT_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_floating_point + : _CUDA_VSTD::bool_constant<_CUDA_VSTD::is_floating_point<_CUDA_VSTD::remove_cv_t<_Tp>>::value + || _CUDA_VSTD::__is_extended_floating_point<_CUDA_VSTD::remove_cv_t<_Tp>>::value> +{}; + +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) +template +_CCCL_INLINE_VAR constexpr bool is_floating_point_v = + _CUDA_VSTD::is_floating_point_v<_CUDA_VSTD::remove_cv_t<_Tp>> + || _CUDA_VSTD::__is_extended_floating_point_v<_CUDA_VSTD::remove_cv_t<_Tp>>; +#endif // !_CCCL_NO_VARIABLE_TEMPLATES + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // __CUDA__TYPE_TRAITS_IS_FLOATING_POINT_H diff --git a/libcudacxx/include/cuda/cmath b/libcudacxx/include/cuda/cmath index 3de1cc6e920..20683c8676f 100644 --- a/libcudacxx/include/cuda/cmath +++ b/libcudacxx/include/cuda/cmath @@ -22,6 +22,8 @@ #endif // no system header #include +#include +#include #include #endif // _CUDA_CMATH diff --git a/libcudacxx/include/cuda/discard_memory b/libcudacxx/include/cuda/discard_memory index 6da2ea209c4..5177b7ee407 100644 --- a/libcudacxx/include/cuda/discard_memory +++ b/libcudacxx/include/cuda/discard_memory @@ -21,11 +21,12 @@ # pragma system_header #endif // no system header +#include #include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, size_t __nbytes) noexcept +inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, _CUDA_VSTD::size_t __nbytes) noexcept { // The discard PTX instruction is only available with PTX ISA 7.4 and later #if __cccl_ptx_isa < 740ULL diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index 7946e8bdc91..a96beb3a520 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -141,6 +141,8 @@ # pragma system_header #endif // no system header +#include +#include #include #include #include diff --git a/libcudacxx/include/cuda/std/__algorithm/copy.h b/libcudacxx/include/cuda/std/__algorithm/copy.h index f4013d4ea73..c42ed39fd5f 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy.h @@ -28,8 +28,8 @@ #include #include #include -#include // ::memmove -#include +#include +#include // memmove _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__algorithm/for_each_n.h b/libcudacxx/include/cuda/std/__algorithm/for_each_n.h index 54428acb69d..ceca1e67af3 100644 --- a/libcudacxx/include/cuda/std/__algorithm/for_each_n.h +++ b/libcudacxx/include/cuda/std/__algorithm/for_each_n.h @@ -28,8 
+28,8 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _InputIterator for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) { - typedef decltype(_CUDA_VSTD::__convert_to_integral(__orig_n)) _IntegralSize; - _IntegralSize __n = __orig_n; + using _IntegralSize = decltype(_CUDA_VSTD::__convert_to_integral(__orig_n)); + _IntegralSize __n = __orig_n; while (__n > 0) { __f(*__first); diff --git a/libcudacxx/include/cuda/std/__algorithm/is_heap_until.h b/libcudacxx/include/cuda/std/__algorithm/is_heap_until.h index ace9c539664..f62e0c9d5ed 100644 --- a/libcudacxx/include/cuda/std/__algorithm/is_heap_until.h +++ b/libcudacxx/include/cuda/std/__algorithm/is_heap_until.h @@ -30,7 +30,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _RandomAccessIterator __is_heap_until(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare&& __comp) { - typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; + using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type; difference_type __len = __last - __first; difference_type __p = 0; difference_type __c = 1; diff --git a/libcudacxx/include/cuda/std/__algorithm/partition_point.h b/libcudacxx/include/cuda/std/__algorithm/partition_point.h index 446c74bad42..f4c17edbf55 100644 --- a/libcudacxx/include/cuda/std/__algorithm/partition_point.h +++ b/libcudacxx/include/cuda/std/__algorithm/partition_point.h @@ -31,7 +31,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _ForwardIterator partition_point(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { - typedef typename iterator_traits<_ForwardIterator>::difference_type difference_type; + using difference_type = typename iterator_traits<_ForwardIterator>::difference_type; difference_type __len = _CUDA_VSTD::distance(__first, __last); while (__len != 0) { diff --git a/libcudacxx/include/cuda/std/__algorithm/rotate.h b/libcudacxx/include/cuda/std/__algorithm/rotate.h index 2aef1790ba0..709b6cddc0f 100644 --- a/libcudacxx/include/cuda/std/__algorithm/rotate.h +++ b/libcudacxx/include/cuda/std/__algorithm/rotate.h @@ -35,8 +35,8 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _ForwardIterator __rotate_left(_ForwardIterator __first, _ForwardIterator __last) { - typedef typename iterator_traits<_ForwardIterator>::value_type value_type; - using _Ops = _IterOps<_AlgPolicy>; + using value_type = typename iterator_traits<_ForwardIterator>::value_type; + using _Ops = _IterOps<_AlgPolicy>; value_type __tmp = _Ops::__iter_move(__first); _ForwardIterator __lm1 = _CUDA_VSTD::__move<_AlgPolicy>(_Ops::next(__first), __last, __first).second; @@ -48,8 +48,8 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _BidirectionalIterator __rotate_right(_BidirectionalIterator __first, _BidirectionalIterator __last) { - typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type; - using _Ops = _IterOps<_AlgPolicy>; + using value_type = typename iterator_traits<_BidirectionalIterator>::value_type; + using _Ops = _IterOps<_AlgPolicy>; _BidirectionalIterator __lm1 = _Ops::prev(__last); value_type __tmp = _Ops::__iter_move(__lm1); @@ -118,9 +118,9 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _RandomAccessIterator __rotate_gcd(_RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomAccessIterator __last) { - typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; - typedef typename 
iterator_traits<_RandomAccessIterator>::value_type value_type; - using _Ops = _IterOps<_AlgPolicy>; + using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type; + using value_type = typename iterator_traits<_RandomAccessIterator>::value_type; + using _Ops = _IterOps<_AlgPolicy>; const difference_type __m1 = __middle - __first; const difference_type __m2 = _Ops::distance(__middle, __last); @@ -158,7 +158,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _ForwardIterator __rotate_impl( _ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last, _CUDA_VSTD::forward_iterator_tag) { - typedef typename iterator_traits<_ForwardIterator>::value_type value_type; + using value_type = typename iterator_traits<_ForwardIterator>::value_type; if (_CCCL_TRAIT(is_trivially_move_assignable, value_type)) { if (_IterOps<_AlgPolicy>::next(__first) == __middle) @@ -176,7 +176,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _BidirectionalIterator __rotate_ _BidirectionalIterator __last, bidirectional_iterator_tag) { - typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type; + using value_type = typename iterator_traits<_BidirectionalIterator>::value_type; if (_CCCL_TRAIT(is_trivially_move_assignable, value_type)) { if (_IterOps<_AlgPolicy>::next(__first) == __middle) @@ -198,7 +198,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _RandomAccessIterator __rotate_i _RandomAccessIterator __last, random_access_iterator_tag) { - typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type; + using value_type = typename iterator_traits<_RandomAccessIterator>::value_type; if (_CCCL_TRAIT(is_trivially_move_assignable, value_type)) { if (_IterOps<_AlgPolicy>::next(__first) == __middle) diff --git a/libcudacxx/include/cuda/std/__algorithm/search.h b/libcudacxx/include/cuda/std/__algorithm/search.h index 078ac059773..1b4d5ed316f 100644 --- a/libcudacxx/include/cuda/std/__algorithm/search.h +++ b/libcudacxx/include/cuda/std/__algorithm/search.h @@ -93,8 +93,8 @@ __search(_RandomAccessIterator1 __first1, random_access_iterator_tag, random_access_iterator_tag) { - typedef typename iterator_traits<_RandomAccessIterator1>::difference_type _Diff1; - typedef typename iterator_traits<_RandomAccessIterator2>::difference_type _Diff2; + using _Diff1 = typename iterator_traits<_RandomAccessIterator1>::difference_type; + using _Diff2 = typename iterator_traits<_RandomAccessIterator2>::difference_type; // Take advantage of knowing source and pattern lengths. 
Stop short when source is smaller than pattern const _Diff2 __len2 = __last2 - __first2; if (__len2 == 0) diff --git a/libcudacxx/include/cuda/std/__algorithm/sift_down.h b/libcudacxx/include/cuda/std/__algorithm/sift_down.h index d0a8f2e75aa..421728039b3 100644 --- a/libcudacxx/include/cuda/std/__algorithm/sift_down.h +++ b/libcudacxx/include/cuda/std/__algorithm/sift_down.h @@ -35,8 +35,8 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __sift_down( { using _Ops = _IterOps<_AlgPolicy>; - typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; - typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type; + using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type; + using value_type = typename iterator_traits<_RandomAccessIterator>::value_type; // left-child of __start is at 2 * __start + 1 // right-child of __start is at 2 * __start + 2 difference_type __child = __start - __first; diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h index 2ecd56daf55..be06fbd34d1 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h @@ -22,6 +22,7 @@ #include #include +#include // This file works around a bug in CUDA in which the compiler miscompiles // atomics to automatic storage (local memory). This bug is not fixed on any @@ -96,7 +97,7 @@ _CCCL_DEVICE inline bool __cuda_load_weak_if_local(const volatile void* __ptr, v { return false; } - memcpy(__ret, const_cast(__ptr), __size); + _CUDA_VSTD::memcpy(__ret, const_cast(__ptr), __size); // Required to workaround a compiler bug, see nvbug/4064730 NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);)) return true; @@ -108,7 +109,7 @@ _CCCL_DEVICE inline bool __cuda_store_weak_if_local(volatile void* __ptr, const { return false; } - memcpy(const_cast(__ptr), __val, __size); + _CUDA_VSTD::memcpy(const_cast(__ptr), __val, __size); return true; } @@ -122,12 +123,12 @@ __cuda_compare_exchange_weak_if_local(volatile _Type* __ptr, _Type* __expected, } if (__atomic_memcmp(const_cast(__ptr), const_cast(__expected), sizeof(_Type)) == 0) { - memcpy(const_cast<_Type*>(__ptr), const_cast<_Type const*>(__desired), sizeof(_Type)); + _CUDA_VSTD::memcpy(const_cast<_Type*>(__ptr), const_cast<_Type const*>(__desired), sizeof(_Type)); *__success = true; } else { - memcpy(const_cast<_Type*>(__expected), const_cast<_Type const*>(__ptr), sizeof(_Type)); + _CUDA_VSTD::memcpy(const_cast<_Type*>(__expected), const_cast<_Type const*>(__ptr), sizeof(_Type)); *__success = false; } NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);)) @@ -141,8 +142,8 @@ _CCCL_DEVICE bool __cuda_exchange_weak_if_local(volatile _Type* __ptr, _Type* __ { return false; } - memcpy(const_cast<_Type*>(__ret), const_cast(__ptr), sizeof(_Type)); - memcpy(const_cast<_Type*>(__ptr), const_cast(__val), sizeof(_Type)); + _CUDA_VSTD::memcpy(const_cast<_Type*>(__ret), const_cast(__ptr), sizeof(_Type)); + _CUDA_VSTD::memcpy(const_cast<_Type*>(__ptr), const_cast(__val), sizeof(_Type)); NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);)) return true; } @@ -154,7 +155,7 @@ _CCCL_DEVICE bool __cuda_fetch_weak_if_local(volatile _Type* __ptr, _Type __val, { return false; } - memcpy(const_cast<_Type*>(__ret), const_cast(__ptr), sizeof(_Type)); + _CUDA_VSTD::memcpy(const_cast<_Type*>(__ret), const_cast(__ptr), sizeof(_Type)); __bop(*__ptr, __val); 
NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);)) return true; diff --git a/libcudacxx/include/cuda/std/__atomic/order.h b/libcudacxx/include/cuda/std/__atomic/order.h index 27136bb3244..bafbd86fe5f 100644 --- a/libcudacxx/include/cuda/std/__atomic/order.h +++ b/libcudacxx/include/cuda/std/__atomic/order.h @@ -84,15 +84,14 @@ inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; #else // ^^^ C++20 ^^^ / vvv C++17 vvv -typedef enum memory_order -{ +using memory_order = enum memory_order { memory_order_relaxed = __mo_relaxed, memory_order_consume = __mo_consume, memory_order_acquire = __mo_acquire, memory_order_release = __mo_release, memory_order_acq_rel = __mo_acq_rel, memory_order_seq_cst = __mo_seq_cst, -} memory_order; +}; #endif // _CCCL_STD_VER >= 2020 diff --git a/libcudacxx/include/cuda/std/__atomic/types/common.h b/libcudacxx/include/cuda/std/__atomic/types/common.h index 6706ad5181b..5d1f5f2d654 100644 --- a/libcudacxx/include/cuda/std/__atomic/types/common.h +++ b/libcudacxx/include/cuda/std/__atomic/types/common.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -92,7 +92,7 @@ _CCCL_HOST_DEVICE inline int __atomic_memcmp(void const* __lhs, void const* __rh } } return 0;), NV_IS_HOST, - (return memcmp(__lhs, __rhs, __count);)) + (return _CUDA_VSTD::memcmp(__lhs, __rhs, __count);)) } _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__atomic/types/small.h b/libcudacxx/include/cuda/std/__atomic/types/small.h index a4e969f0936..e9ce704b4c1 100644 --- a/libcudacxx/include/cuda/std/__atomic/types/small.h +++ b/libcudacxx/include/cuda/std/__atomic/types/small.h @@ -28,6 +28,7 @@ #include #include #include +#include _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -53,7 +54,7 @@ template = 0> _CCCL_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { __atomic_small_proxy_t<_Tp> __temp{}; - memcpy(&__temp, &__val, sizeof(_Tp)); + _CUDA_VSTD::memcpy(&__temp, &__val, sizeof(_Tp)); return __temp; } @@ -61,7 +62,7 @@ template = 0> _CCCL_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { _Tp __temp{}; - memcpy(&__temp, &__val, sizeof(_Tp)); + _CUDA_VSTD::memcpy(&__temp, &__val, sizeof(_Tp)); return __temp; } diff --git a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h index 1fc40a07665..00646e47984 100644 --- a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h +++ b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h @@ -25,6 +25,10 @@ #include #include +#if !_CCCL_COMPILER(NVRTC) +# include +#endif // !_CCCL_COMPILER(NVRTC) + _LIBCUDACXX_BEGIN_NAMESPACE_STD extern "C" _CCCL_DEVICE void __atomic_try_wait_unsupported_before_SM_70__(); @@ -56,7 +60,7 @@ _LIBCUDACXX_HIDE_FROM_ABI bool __nonatomic_compare_equal(_Tp const& __lhs, _Tp c #if _CCCL_HAS_CUDA_COMPILER return __lhs == __rhs; #else - return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; + return _CUDA_VSTD::memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; #endif } diff --git a/libcudacxx/include/cuda/std/__bit/bit_cast.h b/libcudacxx/include/cuda/std/__bit/bit_cast.h index a0579942d90..7e265232d0b 100644 --- a/libcudacxx/include/cuda/std/__bit/bit_cast.h +++ b/libcudacxx/include/cuda/std/__bit/bit_cast.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__bit/reference.h b/libcudacxx/include/cuda/std/__bit/reference.h index 12acac014b1..13a5898014d 100644 
--- a/libcudacxx/include/cuda/std/__bit/reference.h +++ b/libcudacxx/include/cuda/std/__bit/reference.h @@ -30,7 +30,6 @@ #include #include #include -#include _CCCL_PUSH_MACROS diff --git a/libcudacxx/include/cuda/std/__cccl/assert.h b/libcudacxx/include/cuda/std/__cccl/assert.h index 5ef9314f310..b8acc604a65 100644 --- a/libcudacxx/include/cuda/std/__cccl/assert.h +++ b/libcudacxx/include/cuda/std/__cccl/assert.h @@ -122,13 +122,14 @@ _CCCL_HOST_DEVICE //! _CCCL_VERIFY is enabled unconditionally and reserved for critical checks that are required to always be on //! _CCCL_ASSERT is enabled conditionally depending on CCCL_ENABLE_HOST_ASSERTIONS and CCCL_ENABLE_DEVICE_ASSERTIONS -#if _CCCL_CUDA_COMPILER(NVHPC) // NVHPC needs to use NV_IF_TARGET instead of __CUDA_ARCH__ -# define _CCCL_VERIFY(expression, message) \ - NV_IF_ELSE_TARGET( \ - NV_IS_DEVICE, (_CCCL_ASSERT_IMPL_DEVICE(expression, message);), (_CCCL_ASSERT_IMPL_HOST(expression, message);)) -# define _CCCL_ASSERT(expression, message) \ - NV_IF_ELSE_TARGET( \ - NV_IS_DEVICE, (_CCCL_ASSERT_DEVICE(expression, message);), (_CCCL_ASSERT_HOST(expression, message);)) +#if _CCCL_CUDA_COMPILER(NVHPC) // NVHPC can't have different behavior for host and device. + // The host version of the assert will also work in device code. +# define _CCCL_VERIFY(expression, message) _CCCL_ASSERT_IMPL_HOST(expression, message) +# if defined(CCCL_ENABLE_HOST_ASSERTIONS) || defined(CCCL_ENABLE_DEVICE_ASSERTIONS) +# define _CCCL_ASSERT(expression, message) _CCCL_ASSERT_HOST(expression, message) +# else +# define _CCCL_ASSERT(expression, message) ((void) 0) +# endif #elif _CCCL_HAS_CUDA_COMPILER # ifdef __CUDA_ARCH__ # define _CCCL_VERIFY(expression, message) _CCCL_ASSERT_IMPL_DEVICE(expression, message) diff --git a/libcudacxx/include/cuda/std/__cccl/attributes.h b/libcudacxx/include/cuda/std/__cccl/attributes.h index a5888cc289e..79f9cadbdc1 100644 --- a/libcudacxx/include/cuda/std/__cccl/attributes.h +++ b/libcudacxx/include/cuda/std/__cccl/attributes.h @@ -134,4 +134,10 @@ # define _CCCL_RESTRICT __restrict__ #endif // ^^^ !_CCCL_COMPILER(MSVC) ^^^ +#if _CCCL_HAS_CPP_ATTRIBUTE(assume) +# define _CCCL_ASSUME(...) [[assume(__VA_ARGS__)]] +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv +# define _CCCL_ASSUME(...) _CCCL_BUILTIN_ASSUME(__VA_ARGS__) +#endif // ^^^ !_CCCL_COMPILER(MSVC) ^^^ + #endif // __CCCL_ATTRIBUTES_H diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index 3a5fda2f0f5..aee334b5562 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -101,10 +101,22 @@ # define _CCCL_BUILTIN_ADDRESSOF(...) __builtin_addressof(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_addressof) -#if _CCCL_CHECK_BUILTIN(builtin_assume) +#if _CCCL_CHECK_BUILTIN(builtin_assume) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) # define _CCCL_BUILTIN_ASSUME(...) __builtin_assume(__VA_ARGS__) +#elif _CCCL_COMPILER(GCC, >=, 13) +# define _CCCL_BUILTIN_ASSUME(...) \ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (__builtin_assume(__VA_ARGS__);), (__attribute__((__assume__(__VA_ARGS__)));)) +#elif _CCCL_COMPILER(MSVC) +# define _CCCL_BUILTIN_ASSUME(...) \ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (__builtin_assume(__VA_ARGS__);), (__assume(__VA_ARGS__);)) #endif // _CCCL_CHECK_BUILTIN(builtin_assume) +#if _CCCL_CHECK_BUILTIN(builtin_prefetch) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_PREFETCH(...) 
NV_IF_TARGET(NV_IS_HOST, __builtin_prefetch(__VA_ARGS__);) +#else +# define _CCCL_BUILTIN_PREFETCH(...) +#endif // _CCCL_CHECK_BUILTIN(builtin_prefetch) + // NVCC prior to 11.2 cannot handle __builtin_assume #if _CCCL_CUDACC_BELOW(11, 2) # undef _CCCL_BUILTIN_ASSUME @@ -150,6 +162,33 @@ # undef _CCCL_BUILTIN_BSWAP128 #endif // _CCCL_CUDA_COMPILER(NVCC) +#if _CCCL_CHECK_BUILTIN(builtin_cbrt) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_CBRTF(...) __builtin_cbrtf(__VA_ARGS__) +# define _CCCL_BUILTIN_CBRT(...) __builtin_cbrt(__VA_ARGS__) +# define _CCCL_BUILTIN_CBRTL(...) __builtin_cbrtl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_cbrt) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "cbrt" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_CBRTF +# undef _CCCL_BUILTIN_CBRT +# undef _CCCL_BUILTIN_CBRTL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_ceil) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_CEILF(...) __builtin_ceilf(__VA_ARGS__) +# define _CCCL_BUILTIN_CEIL(...) __builtin_ceil(__VA_ARGS__) +# define _CCCL_BUILTIN_CEILL(...) __builtin_ceill(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_ceil) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_CEILF +# undef _CCCL_BUILTIN_CEIL +# undef _CCCL_BUILTIN_CEILL +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_COLUMN() __builtin_COLUMN() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_COLUMN) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_COLUMN) vvv @@ -162,14 +201,69 @@ # define _CCCL_BUILTIN_COLUMN() 0 #endif // _CCCL_CUDACC_BELOW(11, 3) -#if _CCCL_CHECK_BUILTIN(builtin_contant_p) || _CCCL_COMPILER(GCC) +#if _CCCL_CHECK_BUILTIN(builtin_constant_p) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_CONSTANT_P(...) __builtin_constant_p(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(builtin_contant_p) +#endif // _CCCL_CHECK_BUILTIN(builtin_constant_p) + +#if _CCCL_CHECK_BUILTIN(builtin_exp) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_EXPF(...) __builtin_expf(__VA_ARGS__) +# define _CCCL_BUILTIN_EXP(...) __builtin_exp(__VA_ARGS__) +# define _CCCL_BUILTIN_EXPL(...) __builtin_expl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_exp) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "expf" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_EXPF +# undef _CCCL_BUILTIN_EXP +# undef _CCCL_BUILTIN_EXPL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_exp2) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_EXP2F(...) __builtin_exp2f(__VA_ARGS__) +# define _CCCL_BUILTIN_EXP2(...) __builtin_exp2(__VA_ARGS__) +# define _CCCL_BUILTIN_EXP2L(...) 
__builtin_exp2l(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_exp2) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "exp2" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_EXP2F +# undef _CCCL_BUILTIN_EXP2 +# undef _CCCL_BUILTIN_EXP2L +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_expm1) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_EXPM1F(...) __builtin_expm1f(__VA_ARGS__) +# define _CCCL_BUILTIN_EXPM1(...) __builtin_expm1(__VA_ARGS__) +# define _CCCL_BUILTIN_EXPM1L(...) __builtin_expm1l(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_expm1) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "expm1" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_EXPM1F +# undef _CCCL_BUILTIN_EXPM1 +# undef _CCCL_BUILTIN_EXPM1L +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) #if _CCCL_CHECK_BUILTIN(builtin_expect) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_EXPECT(...) __builtin_expect(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_expect) +#if _CCCL_CHECK_BUILTIN(builtin_floor) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_FLOORF(...) __builtin_floorf(__VA_ARGS__) +# define _CCCL_BUILTIN_FLOOR(...) __builtin_floor(__VA_ARGS__) +# define _CCCL_BUILTIN_FLOORL(...) __builtin_floorl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_floor) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_FLOORF +# undef _CCCL_BUILTIN_FLOOR +# undef _CCCL_BUILTIN_FLOORL +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_CHECK_BUILTIN(builtin_fmax) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_FMAXF(...) __builtin_fmaxf(__VA_ARGS__) # define _CCCL_BUILTIN_FMAX(...) __builtin_fmax(__VA_ARGS__) @@ -217,6 +311,20 @@ # undef _CCCL_BUILTIN_FPCLASSIFY #endif // _CCCL_CUDACC_BELOW(11, 7) +#if _CCCL_CHECK_BUILTIN(builtin_frexp) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_FREXPF(...) __builtin_frexpf(__VA_ARGS__) +# define _CCCL_BUILTIN_FREXP(...) __builtin_frexp(__VA_ARGS__) +# define _CCCL_BUILTIN_FREXPL(...) 
__builtin_frexpl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_frexp) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "frexp" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_FREXPF +# undef _CCCL_BUILTIN_FREXP +# undef _CCCL_BUILTIN_FREXPL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + #if _CCCL_HAS_BUILTIN(__builtin_FUNCTION) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_FUNCTION() __builtin_FUNCTION() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_FUNCTION) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_FUNCTION) vvv @@ -229,6 +337,20 @@ # define _CCCL_BUILTIN_FUNCTION() "__builtin_FUNCTION is unsupported" #endif // _CCCL_CUDACC_BELOW(11, 3) +#if _CCCL_CHECK_BUILTIN(builtin_huge_valf) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10) +# define _CCCL_BUILTIN_HUGE_VALF() __builtin_huge_valf() +#endif // _CCCL_CHECK_BUILTIN(builtin_huge_valf) + +#if _CCCL_CHECK_BUILTIN(builtin_huge_val) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10) +# define _CCCL_BUILTIN_HUGE_VAL() __builtin_huge_val() +#endif // _CCCL_CHECK_BUILTIN(builtin_huge_val) + +#if _CCCL_CHECK_BUILTIN(builtin_huge_vall) || _CCCL_COMPILER(GCC, <, 10) +# define _CCCL_BUILTIN_HUGE_VALL() __builtin_huge_vall() +#elif _CCCL_COMPILER(MSVC) +# define _CCCL_BUILTIN_HUGE_VALL() static_cast(__builtin_huge_val()) +#endif // _CCCL_CHECK_BUILTIN(builtin_huge_vall) + #if _CCCL_CHECK_BUILTIN(builtin_is_constant_evaluated) || _CCCL_COMPILER(GCC, >=, 9) \ || (_CCCL_COMPILER(MSVC, >, 19, 24) && _CCCL_CUDACC_AT_LEAST(11, 3)) # define _CCCL_BUILTIN_IS_CONSTANT_EVALUATED(...) __builtin_is_constant_evaluated(__VA_ARGS__) @@ -276,6 +398,20 @@ # undef _CCCL_BUILTIN_LAUNDER #endif // clang < 10 || nvcc < 11.3 +#if _CCCL_CHECK_BUILTIN(builtin_ldexp) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LDEXPF(...) __builtin_ldexpf(__VA_ARGS__) +# define _CCCL_BUILTIN_LDEXP(...) __builtin_ldexp(__VA_ARGS__) +# define _CCCL_BUILTIN_LDEXPL(...) __builtin_ldexpl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_ldexp) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "ldexp" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_LDEXPF +# undef _CCCL_BUILTIN_LDEXP +# undef _CCCL_BUILTIN_LDEXPL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + #if _CCCL_HAS_BUILTIN(__builtin_LINE) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_LINE() __builtin_LINE() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_LINE) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_LINE) vvv @@ -288,19 +424,60 @@ # define _CCCL_BUILTIN_LINE() __LINE__ #endif // _CCCL_CUDACC_BELOW(11, 3) -#if _CCCL_CHECK_BUILTIN(builtin_huge_valf) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10) -# define _CCCL_BUILTIN_HUGE_VALF() __builtin_huge_valf() -#endif // _CCCL_CHECK_BUILTIN(builtin_huge_valf) +#if _CCCL_CHECK_BUILTIN(builtin_llrint) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LLRINTF(...) __builtin_llrintf(__VA_ARGS__) +# define _CCCL_BUILTIN_LLRINT(...) __builtin_llrint(__VA_ARGS__) +# define _CCCL_BUILTIN_LLRINTL(...) 
__builtin_llrintl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_llrint) -#if _CCCL_CHECK_BUILTIN(builtin_huge_val) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10) -# define _CCCL_BUILTIN_HUGE_VAL() __builtin_huge_val() -#endif // _CCCL_CHECK_BUILTIN(builtin_huge_val) +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "llrint" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_LLRINTF +# undef _CCCL_BUILTIN_LLRINT +# undef _CCCL_BUILTIN_LLRINTL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) -#if _CCCL_CHECK_BUILTIN(builtin_huge_vall) || _CCCL_COMPILER(GCC, <, 10) -# define _CCCL_BUILTIN_HUGE_VALL() __builtin_huge_vall() -#elif _CCCL_COMPILER(MSVC) -# define _CCCL_BUILTIN_HUGE_VALL() static_cast(__builtin_huge_val()) -#endif // _CCCL_CHECK_BUILTIN(builtin_huge_vall) +#if _CCCL_CHECK_BUILTIN(builtin_llround) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LLROUNDF(...) __builtin_llroundf(__VA_ARGS__) +# define _CCCL_BUILTIN_LLROUND(...) __builtin_llround(__VA_ARGS__) +# define _CCCL_BUILTIN_LLROUNDL(...) __builtin_llroundl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_llround) + +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "llround" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_LLROUNDF +# undef _CCCL_BUILTIN_LLROUND +# undef _CCCL_BUILTIN_LLROUNDL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_lrint) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LRINTF(...) __builtin_lrintf(__VA_ARGS__) +# define _CCCL_BUILTIN_LRINT(...) __builtin_lrint(__VA_ARGS__) +# define _CCCL_BUILTIN_LRINTL(...) __builtin_lrintl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_lrint) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "lrint" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_LRINTF +# undef _CCCL_BUILTIN_LRINT +# undef _CCCL_BUILTIN_LRINTL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_lround) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LROUNDF(...) __builtin_lroundf(__VA_ARGS__) +# define _CCCL_BUILTIN_LROUND(...) __builtin_lround(__VA_ARGS__) +# define _CCCL_BUILTIN_LROUNDL(...) __builtin_lroundl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_lround) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "lround" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_LROUNDF +# undef _CCCL_BUILTIN_LROUND +# undef _CCCL_BUILTIN_LROUNDL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) #if _CCCL_CHECK_BUILTIN(builtin_nanf) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, <, 10) # define _CCCL_BUILTIN_NANF(...) __builtin_nanf(__VA_ARGS__) @@ -330,6 +507,46 @@ # define _CCCL_BUILTIN_NANSL(...) static_cast(__builtin_nans(__VA_ARGS__)) #endif // _CCCL_CHECK_BUILTIN(builtin_nansl) +#if _CCCL_CHECK_BUILTIN(builtin_nearbyint) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_NEARBYINTF(...) __builtin_nearbyintf(__VA_ARGS__) +# define _CCCL_BUILTIN_NEARBYINT(...) __builtin_nearbyint(__VA_ARGS__) +# define _CCCL_BUILTIN_NEARBYINTL(...) 
__builtin_nearbyintl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_nearbyint) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_NEARBYINTF +# undef _CCCL_BUILTIN_NEARBYINT +# undef _CCCL_BUILTIN_NEARBYINTL +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_nextafter) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_NEXTAFTERF(...) __builtin_nextafterf(__VA_ARGS__) +# define _CCCL_BUILTIN_NEXTAFTER(...) __builtin_nextafter(__VA_ARGS__) +# define _CCCL_BUILTIN_NEXTAFTERL(...) __builtin_nextafterl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_nextafter) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "nextafter" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_NEXTAFTERF +# undef _CCCL_BUILTIN_NEXTAFTER +# undef _CCCL_BUILTIN_NEXTAFTERL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_nexttoward) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_NEXTTOWARDF(...) __builtin_nexttowardf(__VA_ARGS__) +# define _CCCL_BUILTIN_NEXTTOWARD(...) __builtin_nexttoward(__VA_ARGS__) +# define _CCCL_BUILTIN_NEXTTOWARDL(...) __builtin_nexttowardl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_nexttoward) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_NEXTTOWARDF +# undef _CCCL_BUILTIN_NEXTTOWARD +# undef _CCCL_BUILTIN_NEXTTOWARDL +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_CHECK_BUILTIN(builtin_log) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_LOGF(...) __builtin_logf(__VA_ARGS__) # define _CCCL_BUILTIN_LOG(...) __builtin_log(__VA_ARGS__) @@ -356,7 +573,7 @@ # undef _CCCL_BUILTIN_LOG10F # undef _CCCL_BUILTIN_LOG10 # undef _CCCL_BUILTIN_LOG10L -#endif // _CCCL_CUDACC_BELOW(11, 7) +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) #if _CCCL_CHECK_BUILTIN(builtin_ilogb) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_ILOGBF(...) __builtin_ilogbf(__VA_ARGS__) @@ -398,7 +615,7 @@ # undef _CCCL_BUILTIN_LOG2F # undef _CCCL_BUILTIN_LOG2 # undef _CCCL_BUILTIN_LOG2L -#endif // _CCCL_CUDACC_BELOW(11, 7) +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) #if _CCCL_CHECK_BUILTIN(builtin_logb) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_LOGBF(...) __builtin_logbf(__VA_ARGS__) @@ -420,6 +637,74 @@ # define _CCCL_BUILTIN_OPERATOR_NEW(...) __builtin_operator_new(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete) +#if _CCCL_CHECK_BUILTIN(builtin_pow) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_POWF(...) __builtin_powf(__VA_ARGS__) +# define _CCCL_BUILTIN_POW(...) __builtin_pow(__VA_ARGS__) +# define _CCCL_BUILTIN_POWL(...) __builtin_powl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_pow) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "pow" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_POWF +# undef _CCCL_BUILTIN_POW +# undef _CCCL_BUILTIN_POWL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_rint) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_RINTF(...) __builtin_rintf(__VA_ARGS__) +# define _CCCL_BUILTIN_RINT(...) 
__builtin_rint(__VA_ARGS__) +# define _CCCL_BUILTIN_RINTL(...) __builtin_rintl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_rint) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_RINTF +# undef _CCCL_BUILTIN_RINT +# undef _CCCL_BUILTIN_RINTL +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_round) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_ROUNDF(...) __builtin_roundf(__VA_ARGS__) +# define _CCCL_BUILTIN_ROUND(...) __builtin_round(__VA_ARGS__) +# define _CCCL_BUILTIN_ROUNDL(...) __builtin_roundl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_round) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_ROUNDF +# undef _CCCL_BUILTIN_ROUND +# undef _CCCL_BUILTIN_ROUNDL +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_scalbln) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SCALBLNF(...) __builtin_scalblnf(__VA_ARGS__) +# define _CCCL_BUILTIN_SCALBLN(...) __builtin_scalbln(__VA_ARGS__) +# define _CCCL_BUILTIN_SCALBLNL(...) __builtin_scalblnl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_scalbln) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "scalblnf" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_SCALBLNF +# undef _CCCL_BUILTIN_SCALBLN +# undef _CCCL_BUILTIN_SCALBLNL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + +#if _CCCL_CHECK_BUILTIN(builtin_scalbn) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SCALBNF(...) __builtin_scalbnf(__VA_ARGS__) +# define _CCCL_BUILTIN_SCALBN(...) __builtin_scalbn(__VA_ARGS__) +# define _CCCL_BUILTIN_SCALBNL(...) __builtin_scalbnl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_scalbn) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "scalbnf" +#if _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_SCALBNF +# undef _CCCL_BUILTIN_SCALBN +# undef _CCCL_BUILTIN_SCALBNL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER(CLANG) + #if _CCCL_CHECK_BUILTIN(builtin_signbit) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_SIGNBIT(...) __builtin_signbit(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_signbit) @@ -429,6 +714,32 @@ # undef _CCCL_BUILTIN_SIGNBIT #endif // _CCCL_CUDACC_BELOW(11, 7) +#if _CCCL_CHECK_BUILTIN(builtin_sqrt) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SQRTF(...) __builtin_sqrtf(__VA_ARGS__) +# define _CCCL_BUILTIN_SQRT(...) __builtin_sqrt(__VA_ARGS__) +# define _CCCL_BUILTIN_SQRTL(...) __builtin_sqrtl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_sqrt) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_SQRTF +# undef _CCCL_BUILTIN_SQRT +# undef _CCCL_BUILTIN_SQRTL +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_trunc) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_TRUNCF(...) __builtin_truncf(__VA_ARGS__) +# define _CCCL_BUILTIN_TRUNC(...) __builtin_trunc(__VA_ARGS__) +# define _CCCL_BUILTIN_TRUNCL(...) 
__builtin_truncl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_trunc) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_TRUNCF +# undef _CCCL_BUILTIN_TRUNC +# undef _CCCL_BUILTIN_TRUNCL +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_HAS_BUILTIN(__decay) && _CCCL_CUDA_COMPILER(CLANG) # define _CCCL_BUILTIN_DECAY(...) __decay(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__decay) && clang-cuda diff --git a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h b/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h index 5169ea4ad67..dee553633d8 100644 --- a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h +++ b/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h @@ -39,4 +39,14 @@ # endif #endif // !_CCCL_HAS_NVBF16 +#if !defined(_CCCL_DISABLE_NVFP8_SUPPORT) +# if _CCCL_HAS_INCLUDE() && defined(_CCCL_HAS_NVFP16) && defined(_CCCL_HAS_NVBF16) +# define _CCCL_HAS_NVFP8() 1 +# else +# define _CCCL_HAS_NVFP8() 0 +# endif // _CCCL_HAS_INCLUDE() +#else +# define _CCCL_HAS_NVFP8() 0 +#endif // !defined(_CCCL_DISABLE_NVFP8_SUPPORT) + #endif // __CCCL_EXTENDED_FLOATING_POINT_H diff --git a/libcudacxx/include/cuda/std/__cmath/exponential_functions.h b/libcudacxx/include/cuda/std/__cmath/exponential_functions.h new file mode 100644 index 00000000000..f00f1807834 --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/exponential_functions.h @@ -0,0 +1,611 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
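A minimal usage sketch for the exponential and scaling overloads this new header introduces, assuming they are reachable through the public <cuda/std/cmath> entry point; the demo function name and the literal values are illustrative only, not part of the patch:

    // Sketch only: exercises the cuda::std overloads defined in exponential_functions.h.
    #include <cuda/std/cmath>

    void exponential_demo()
    {
      float  e   = cuda::std::exp(1.0f);            // float overload, maps to the builtin or ::expf
      double p2  = cuda::std::exp2(10);             // integral argument promotes to double -> 1024.0
      double em1 = cuda::std::expm1(1e-9);          // stays accurate near zero, unlike exp(x) - 1.0

      int   exponent = 0;
      float mantissa = cuda::std::frexp(8.0f, &exponent);    // mantissa = 0.5f, exponent = 4
      float back     = cuda::std::ldexp(mantissa, exponent); // reassembles 8.0f
      float scaled   = cuda::std::scalbn(1.5f, 3);           // 1.5 * 2^3 = 12.0f

      double pw = cuda::std::pow(2.0, 10);          // mixed arguments promote via __promote_t -> 1024.0

      (void) e; (void) p2; (void) em1; (void) back; (void) scaled; (void) pw;
    }

The same calls are expected to compile in device code, and the __half / __nv_bfloat16 overloads defined below follow the same dispatch pattern when the FP16/BF16 headers are enabled.
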
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___MATH_EXPONENTIAL_FUNCTIONS_H +#define _LIBCUDACXX___MATH_EXPONENTIAL_FUNCTIONS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// exp + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float exp(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXPF) + return _CCCL_BUILTIN_EXPF(__x); +#else // ^^^ _CCCL_BUILTIN_EXPF ^^^ // vvv !_CCCL_BUILTIN_EXPF vvv + return ::expf(__x); +#endif // !_CCCL_BUILTIN_EXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float expf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXPF) + return _CCCL_BUILTIN_EXPF(__x); +#else // ^^^ _CCCL_BUILTIN_EXPF ^^^ // vvv !_CCCL_BUILTIN_EXPF vvv + return ::expf(__x); +#endif // !_CCCL_BUILTIN_EXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double exp(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXP) + return _CCCL_BUILTIN_EXP(__x); +#else // ^^^ _CCCL_BUILTIN_EXP ^^^ // vvv !_CCCL_BUILTIN_EXP vvv + return ::exp(__x); +#endif // !_CCCL_BUILTIN_EXP +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double exp(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXPL) + return _CCCL_BUILTIN_EXPL(__x); +# else // ^^^ _CCCL_BUILTIN_EXPL ^^^ // vvv !_CCCL_BUILTIN_EXPL vvv + return ::expl(__x); +# endif // !_CCCL_BUILTIN_EXPL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double expl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXPL) + return _CCCL_BUILTIN_EXPL(__x); +# else // ^^^ _CCCL_BUILTIN_EXPL ^^^ // vvv !_CCCL_BUILTIN_EXPL vvv + return ::expl(__x); +# endif // !_CCCL_BUILTIN_EXPL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half exp(__half __x) noexcept +{ + { + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, (return ::hexp(__x);), ({ + float __xf = __half2float(__x); + __xf = ::expf(__xf); + __half_raw __ret_repr = ::__float2half_rn(__xf); + + uint16_t __repr = __half_raw(__x).x; + switch (__repr) + { + case 8057: + case 9679: + __ret_repr.x -= 1; + break; + + default:; + } + + return __ret_repr; + })) + } +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 exp(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hexp(__x);), (return __float2bfloat16(_CUDA_VSTD::expf(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double exp(_Integer __x) noexcept +{ + return _CUDA_VSTD::exp((double) __x); +} + +// frexp + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float frexp(float __x, int* __e) noexcept +{ +#if defined(_CCCL_BUILTIN_FREXPF) + return _CCCL_BUILTIN_FREXPF(__x, __e); +#else // ^^^ _CCCL_BUILTIN_FREXPF ^^^ // vvv !_CCCL_BUILTIN_FREXPF vvv + return ::frexpf(__x, __e); +#endif // !_CCCL_BUILTIN_FREXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float frexpf(float __x, int* __e) noexcept +{ +#if defined(_CCCL_BUILTIN_FREXPF) + return _CCCL_BUILTIN_FREXPF(__x, __e); +#else // ^^^ _CCCL_BUILTIN_FREXPF ^^^ // vvv !_CCCL_BUILTIN_FREXPF vvv + return 
::frexpf(__x, __e); +#endif // !_CCCL_BUILTIN_FREXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double frexp(double __x, int* __e) noexcept +{ +#if defined(_CCCL_BUILTIN_FREXP) + return _CCCL_BUILTIN_FREXP(__x, __e); +#else // ^^^ _CCCL_BUILTIN_FREXP ^^^ // vvv !_CCCL_BUILTIN_FREXP vvv + return ::frexp(__x, __e); +#endif // !_CCCL_BUILTIN_FREXP +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double frexp(long double __x, int* __e) noexcept +{ +# if defined(_CCCL_BUILTIN_FREXPL) + return _CCCL_BUILTIN_FREXPL(__x, __e); +# else // ^^^ _CCCL_BUILTIN_FREXPL ^^^ // vvv !_CCCL_BUILTIN_FREXPL vvv + return ::frexpl(__x, __e); +# endif // !_CCCL_BUILTIN_FREXPL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double frexpl(long double __x, int* __e) noexcept +{ +# if defined(_CCCL_BUILTIN_FREXPL) + return _CCCL_BUILTIN_FREXPL(__x, __e); +# else // ^^^ _CCCL_BUILTIN_FREXPL ^^^ // vvv !_CCCL_BUILTIN_FREXPL vvv + return ::frexpl(__x, __e); +# endif // !_CCCL_BUILTIN_FREXPL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half frexp(__half __x, int* __e) noexcept +{ + return __float2half(_CUDA_VSTD::frexpf(__half2float(__x), __e)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 frexp(__nv_bfloat16 __x, int* __e) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::frexpf(__bfloat162float(__x), __e)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double frexp(_Integer __x, int* __e) noexcept +{ + return _CUDA_VSTD::frexp((double) __x, __e); +} + +// ldexp + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float ldexp(float __x, int __e) noexcept +{ +#if defined(_CCCL_BUILTIN_LDEXPF) + return _CCCL_BUILTIN_LDEXPF(__x, __e); +#else // ^^^ _CCCL_BUILTIN_LDEXPF ^^^ // vvv !_CCCL_BUILTIN_LDEXPF vvv + return ::ldexpf(__x, __e); +#endif // !_CCCL_BUILTIN_LDEXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float ldexpf(float __x, int __e) noexcept +{ +#if defined(_CCCL_BUILTIN_LDEXPF) + return _CCCL_BUILTIN_LDEXPF(__x, __e); +#else // ^^^ _CCCL_BUILTIN_LDEXPF ^^^ // vvv !_CCCL_BUILTIN_LDEXPF vvv + return ::ldexpf(__x, __e); +#endif // !_CCCL_BUILTIN_LDEXPF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double ldexp(double __x, int __e) noexcept +{ +#if defined(_CCCL_BUILTIN_LDEXP) + return _CCCL_BUILTIN_LDEXP(__x, __e); +#else // ^^^ _CCCL_BUILTIN_LDEXP ^^^ // vvv !_CCCL_BUILTIN_LDEXP vvv + return ::ldexp(__x, __e); +#endif // !_CCCL_BUILTIN_LDEXP +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double ldexp(long double __x, int __e) noexcept +{ +# if defined(_CCCL_BUILTIN_LDEXPL) + return _CCCL_BUILTIN_LDEXPL(__x, __e); +# else // ^^^ _CCCL_BUILTIN_LDEXPL ^^^ // vvv !_CCCL_BUILTIN_LDEXPL vvv + return ::ldexpl(__x, __e); +# endif // !_CCCL_BUILTIN_LDEXPL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double ldexpl(long double __x, int __e) noexcept +{ +# if defined(_CCCL_BUILTIN_LDEXPL) + return _CCCL_BUILTIN_LDEXPL(__x, __e); +# else // ^^^ _CCCL_BUILTIN_LDEXPL ^^^ // vvv !_CCCL_BUILTIN_LDEXPL vvv + return ::ldexpl(__x, __e); +# endif // !_CCCL_BUILTIN_LDEXPL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half ldexp(__half __x, int __e) noexcept +{ + return __float2half(_CUDA_VSTD::ldexpf(__half2float(__x), 
__e)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 ldexp(__nv_bfloat16 __x, int __e) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::ldexpf(__bfloat162float(__x), __e)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double ldexp(_Integer __x, int __e) noexcept +{ + return _CUDA_VSTD::ldexp((double) __x, __e); +} + +// exp2 + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float exp2(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXP2F) + return _CCCL_BUILTIN_EXP2F(__x); +#else // ^^^ _CCCL_BUILTIN_EXP2F ^^^ // vvv !_CCCL_BUILTIN_EXP2F vvv + return ::exp2f(__x); +#endif // !_CCCL_BUILTIN_EXP2F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float exp2f(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXP2F) + return _CCCL_BUILTIN_EXP2F(__x); +#else // ^^^ _CCCL_BUILTIN_EXP2F ^^^ // vvv !_CCCL_BUILTIN_EXP2F vvv + return ::exp2f(__x); +#endif // !_CCCL_BUILTIN_EXP2F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double exp2(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXP2) + return _CCCL_BUILTIN_EXP2(__x); +#else // ^^^ _CCCL_BUILTIN_EXP2 ^^^ // vvv !_CCCL_BUILTIN_EXP2 vvv + return ::exp2(__x); +#endif // !_CCCL_BUILTIN_EXP2 +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double exp2(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXP2L) + return _CCCL_BUILTIN_EXP2L(__x); +# else // ^^^ _CCCL_BUILTIN_EXP2L ^^^ // vvv !_CCCL_BUILTIN_EXP2L vvv + return ::exp2l(__x); +# endif // !_CCCL_BUILTIN_EXP2L +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double exp2l(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXP2L) + return _CCCL_BUILTIN_EXP2L(__x); +# else // ^^^ _CCCL_BUILTIN_EXP2L ^^^ // vvv !_CCCL_BUILTIN_EXP2L vvv + return ::exp2l(__x); +# endif // !_CCCL_BUILTIN_EXP2L +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half exp2(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hexp2(__x);), (return __float2half(_CUDA_VSTD::exp2f(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 exp2(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hexp2(__x);), (return __float2bfloat16(_CUDA_VSTD::exp2f(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double exp2(_Integer __x) noexcept +{ + return _CUDA_VSTD::exp2((double) __x); +} + +// expm1 + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float expm1(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXPM1F) + return _CCCL_BUILTIN_EXPM1F(__x); +#else // ^^^ _CCCL_BUILTIN_EXPM1F ^^^ // vvv !_CCCL_BUILTIN_EXPM1F vvv + return ::expm1f(__x); +#endif // !_CCCL_BUILTIN_EXPM1F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float expm1f(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXPM1F) + return _CCCL_BUILTIN_EXPM1F(__x); +#else // ^^^ _CCCL_BUILTIN_EXPM1F ^^^ // vvv !_CCCL_BUILTIN_EXPM1F vvv + return ::expm1f(__x); +#endif // !_CCCL_BUILTIN_EXPM1F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double expm1(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_EXPM1) + return _CCCL_BUILTIN_EXPM1(__x); +#else // ^^^ _CCCL_BUILTIN_EXPM1 ^^^ // vvv !_CCCL_BUILTIN_EXPM1 vvv + return ::expm1(__x); +#endif // !_CCCL_BUILTIN_EXPM1 +} + +#if 
!defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double expm1(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXPM1L) + return _CCCL_BUILTIN_EXPM1L(__x); +# else // ^^^ _CCCL_BUILTIN_EXPM1L ^^^ // vvv !_CCCL_BUILTIN_EXPM1L vvv + return ::expm1l(__x); +# endif // !_CCCL_BUILTIN_EXPM1L +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double expm1l(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_EXPM1L) + return _CCCL_BUILTIN_EXPM1L(__x); +# else // ^^^ _CCCL_BUILTIN_EXPM1L ^^^ // vvv !_CCCL_BUILTIN_EXPM1L vvv + return ::expm1l(__x); +# endif // !_CCCL_BUILTIN_EXPM1L +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half expm1(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::expm1f(__half2float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 expm1(__nv_bfloat16 __x) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::expm1f(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double expm1(_Integer __x) noexcept +{ + return _CUDA_VSTD::expm1((double) __x); +} + +// scalbln + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float scalbln(float __x, long __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBLNF) + return _CCCL_BUILTIN_SCALBLNF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBLNF ^^^ // vvv !_CCCL_BUILTIN_SCALBLNF vvv + return ::scalblnf(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBLNF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float scalblnf(float __x, long __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBLNF) + return _CCCL_BUILTIN_SCALBLNF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBLNF ^^^ // vvv !_CCCL_BUILTIN_SCALBLNF vvv + return ::scalblnf(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBLNF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double scalbln(double __x, long __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBLN) + return _CCCL_BUILTIN_SCALBLN(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBLN ^^^ // vvv !_CCCL_BUILTIN_SCALBLN vvv + return ::scalbln(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBLN +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double scalbln(long double __x, long __y) noexcept +{ +# if defined(_CCCL_BUILTIN_SCALBLNL) + return _CCCL_BUILTIN_SCALBLNL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_SCALBLNL ^^^ // vvv !_CCCL_BUILTIN_SCALBLNL vvv + return ::scalblnl(__x, __y); +# endif // !_CCCL_BUILTIN_SCALBLNL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double scalblnl(long double __x, long __y) noexcept +{ +# if defined(_CCCL_BUILTIN_SCALBLNL) + return _CCCL_BUILTIN_SCALBLNL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_SCALBLNL ^^^ // vvv !_CCCL_BUILTIN_SCALBLNL vvv + return ::scalblnl(__x, __y); +# endif // !_CCCL_BUILTIN_SCALBLNL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half scalbln(__half __x, long __y) noexcept +{ + return __float2half(_CUDA_VSTD::scalblnf(__half2float(__x), __y)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 scalbln(__nv_bfloat16 __x, long __y) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::scalblnf(__bfloat162float(__x), __y)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI 
double scalbln(_Integer __x, long __y) noexcept +{ + return _CUDA_VSTD::scalbln((double) __x, __y); +} + +// scalbn + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float scalbn(float __x, int __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBNF) + return _CCCL_BUILTIN_SCALBNF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBNF ^^^ // vvv !_CCCL_BUILTIN_SCALBNF vvv + return ::scalbnf(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBNF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float scalbnf(float __x, int __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBNF) + return _CCCL_BUILTIN_SCALBNF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBNF ^^^ // vvv !_CCCL_BUILTIN_SCALBNF vvv + return ::scalbnf(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBNF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double scalbn(double __x, int __y) noexcept +{ +#if defined(_CCCL_BUILTIN_SCALBN) + return _CCCL_BUILTIN_SCALBN(__x, __y); +#else // ^^^ _CCCL_BUILTIN_SCALBN ^^^ // vvv !_CCCL_BUILTIN_SCALBN vvv + return ::scalbn(__x, __y); +#endif // !_CCCL_BUILTIN_SCALBN +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double scalbn(long double __x, int __y) noexcept +{ +# if defined(_CCCL_BUILTIN_SCALBNL) + return _CCCL_BUILTIN_SCALBNL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_SCALBNL ^^^ // vvv !_CCCL_BUILTIN_SCALBNL vvv + return ::scalbnl(__x, __y); +# endif // !_CCCL_BUILTIN_SCALBNL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double scalbnl(long double __x, int __y) noexcept +{ +# if defined(_CCCL_BUILTIN_SCALBNL) + return _CCCL_BUILTIN_SCALBNL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_SCALBNL ^^^ // vvv !_CCCL_BUILTIN_SCALBNL vvv + return ::scalbnl(__x, __y); +# endif // !_CCCL_BUILTIN_SCALBNL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half scalbn(__half __x, int __y) noexcept +{ + return __float2half(_CUDA_VSTD::scalbnf(__half2float(__x), __y)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 scalbn(__nv_bfloat16 __x, int __y) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::scalbnf(__bfloat162float(__x), __y)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double scalbn(_Integer __x, int __y) noexcept +{ + return _CUDA_VSTD::scalbn((double) __x, __y); +} + +// pow + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float pow(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_POWF) + return _CCCL_BUILTIN_POWF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_POWF ^^^ // vvv !_CCCL_BUILTIN_POWF vvv + return ::powf(__x, __y); +#endif // !_CCCL_BUILTIN_POWF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float powf(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_POWF) + return _CCCL_BUILTIN_POWF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_POWF ^^^ // vvv !_CCCL_BUILTIN_POWF vvv + return ::powf(__x, __y); +#endif // !_CCCL_BUILTIN_POWF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double pow(double __x, double __y) noexcept +{ +#if defined(_CCCL_BUILTIN_POW) + return _CCCL_BUILTIN_POW(__x, __y); +#else // ^^^ _CCCL_BUILTIN_POW ^^^ // vvv !_CCCL_BUILTIN_POW vvv + return ::pow(__x, __y); +#endif // !_CCCL_BUILTIN_POW +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double pow(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_POWL) + return _CCCL_BUILTIN_POWL(__x, __y); +# else // ^^^ 
_CCCL_BUILTIN_POWL ^^^ // vvv !_CCCL_BUILTIN_POWL vvv + return ::powl(__x, __y); +# endif // !_CCCL_BUILTIN_POWL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double powl(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_POWL) + return _CCCL_BUILTIN_POWL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_POWL ^^^ // vvv !_CCCL_BUILTIN_POWL vvv + return ::powl(__x, __y); +# endif // !_CCCL_BUILTIN_POWL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half pow(__half __x, __half __y) noexcept +{ + return __float2half(_CUDA_VSTD::powf(__half2float(__x), __half2float(__y))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 pow(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::powf(__bfloat162float(__x), __bfloat162float(__y))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, _A2> pow(_A1 __x, _A2 __y) noexcept +{ + using __result_type = __promote_t<_A1, _A2>; + static_assert(!(_CCCL_TRAIT(is_same, _A1, __result_type) && _CCCL_TRAIT(is_same, _A2, __result_type)), ""); + return _CUDA_VSTD::pow((__result_type) __x, (__result_type) __y); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___MATH_EXPONENTIAL_FUNCTIONS_H diff --git a/libcudacxx/include/cuda/std/__cmath/nvbf16.h b/libcudacxx/include/cuda/std/__cmath/nvbf16.h index 8f116968f8b..b0bda438e6e 100644 --- a/libcudacxx/include/cuda/std/__cmath/nvbf16.h +++ b/libcudacxx/include/cuda/std/__cmath/nvbf16.h @@ -55,11 +55,6 @@ _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 cosh(__nv_bfloat16 __v) return __float2bfloat16(::coshf(__bfloat162float(__v))); } -_LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 exp(__nv_bfloat16 __v) -{ - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hexp(__v);), (return __float2bfloat16(::expf(__bfloat162float(__v)));)) -} - _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 hypot(__nv_bfloat16 __x, __nv_bfloat16 __y) { return __float2bfloat16(::hypotf(__bfloat162float(__x), __bfloat162float(__y))); @@ -70,11 +65,6 @@ _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 atan2(__nv_bfloat16 __x, __nv_bfloat16 _ return __float2bfloat16(::atan2f(__bfloat162float(__x), __bfloat162float(__y))); } -_LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 sqrt(__nv_bfloat16 __x) -{ - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2bfloat16(::sqrtf(__bfloat162float(__x)));)) -} - // floating point helper _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_copysign(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept { diff --git a/libcudacxx/include/cuda/std/__cmath/nvfp16.h b/libcudacxx/include/cuda/std/__cmath/nvfp16.h index dbcaebbb4ef..1f295088aaf 100644 --- a/libcudacxx/include/cuda/std/__cmath/nvfp16.h +++ b/libcudacxx/include/cuda/std/__cmath/nvfp16.h @@ -97,34 +97,6 @@ _LIBCUDACXX_HIDE_FROM_ABI __half cosh(__half __v) return __float2half(::coshf(__half2float(__v))); } -// clang-format off -_LIBCUDACXX_HIDE_FROM_ABI __half exp(__half __v) -{ - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, ( - return ::hexp(__v); - ), ( - { - float __vf = __half2float(__v); - __vf = ::expf(__vf); - __half_raw __ret_repr = ::__float2half_rn(__vf); - - uint16_t __repr = __half_raw(__v).x; - switch (__repr) - { - case 8057: - case 9679: - __ret_repr.x -= 1; - break; - - default:; - } - - return __ret_repr; - } - )) -} -// clang-format on - _LIBCUDACXX_HIDE_FROM_ABI __half hypot(__half __x, __half 
__y) { return __float2half(::hypotf(__half2float(__x), __half2float(__y))); @@ -135,11 +107,6 @@ _LIBCUDACXX_HIDE_FROM_ABI __half atan2(__half __x, __half __y) return __float2half(::atan2f(__half2float(__x), __half2float(__y))); } -_LIBCUDACXX_HIDE_FROM_ABI __half sqrt(__half __x) -{ - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2half(::sqrtf(__half2float(__x)));)) -} - // floating point helper _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_copysign(__half __x, __half __y) noexcept { diff --git a/libcudacxx/include/cuda/std/__cmath/roots.h b/libcudacxx/include/cuda/std/__cmath/roots.h new file mode 100644 index 00000000000..0d2065dcf5a --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/roots.h @@ -0,0 +1,171 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_ROOTS_H +#define _LIBCUDACXX___CMATH_ROOTS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// sqrt + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float sqrt(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_SQRTF) + return _CCCL_BUILTIN_SQRTF(__x); +#else // ^^^ _CCCL_BUILTIN_SQRTF ^^^ // vvv !_CCCL_BUILTIN_SQRTF vvv + return ::sqrtf(__x); +#endif // !_CCCL_BUILTIN_SQRTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float sqrtf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_SQRTF) + return _CCCL_BUILTIN_SQRTF(__x); +#else // ^^^ _CCCL_BUILTIN_SQRTF ^^^ // vvv !_CCCL_BUILTIN_SQRTF vvv + return ::sqrtf(__x); +#endif // !_CCCL_BUILTIN_SQRTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double sqrt(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_SQRT) + return _CCCL_BUILTIN_SQRT(__x); +#else // ^^^ _CCCL_BUILTIN_SQRT ^^^ // vvv !_CCCL_BUILTIN_SQRT vvv + return ::sqrt(__x); +#endif // !_CCCL_BUILTIN_SQRT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double sqrt(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_SQRTL) + return _CCCL_BUILTIN_SQRTL(__x); +# else // ^^^ _CCCL_BUILTIN_SQRTL ^^^ // vvv !_CCCL_BUILTIN_SQRTL vvv + return ::sqrtl(__x); +# endif // !_CCCL_BUILTIN_SQRTL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double sqrtl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_SQRTL) + return _CCCL_BUILTIN_SQRTL(__x); +# else // ^^^ _CCCL_BUILTIN_SQRTL ^^^ // vvv !_CCCL_BUILTIN_SQRTL vvv + return ::sqrtl(__x); +# endif // !_CCCL_BUILTIN_SQRTL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half sqrt(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2half(_CUDA_VSTD::sqrt(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 sqrt(__nv_bfloat16 __x) noexcept +{ + 
NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2bfloat16(_CUDA_VSTD::sqrt(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double sqrt(_Integer __x) noexcept +{ + return _CUDA_VSTD::sqrt((double) __x); +} + +// cbrt + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float cbrt(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CBRTF) + return _CCCL_BUILTIN_CBRTF(__x); +#else // ^^^ _CCCL_BUILTIN_CBRTF ^^^ // vvv !_CCCL_BUILTIN_CBRTF vvv + return ::cbrtf(__x); +#endif // !_CCCL_BUILTIN_CBRTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float cbrtf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CBRTF) + return _CCCL_BUILTIN_CBRTF(__x); +#else // ^^^ _CCCL_BUILTIN_CBRTF ^^^ // vvv !_CCCL_BUILTIN_CBRTF vvv + return ::cbrtf(__x); +#endif // !_CCCL_BUILTIN_CBRTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double cbrt(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CBRT) + return _CCCL_BUILTIN_CBRT(__x); +#else // ^^^ _CCCL_BUILTIN_CBRT ^^^ // vvv !_CCCL_BUILTIN_CBRT vvv + return ::cbrt(__x); +#endif // !_CCCL_BUILTIN_CBRT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double cbrt(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_CBRTL) + return _CCCL_BUILTIN_CBRTL(__x); +# else // ^^^ _CCCL_BUILTIN_CBRTL ^^^ // vvv !_CCCL_BUILTIN_CBRTL vvv + return ::cbrtl(__x); +# endif // !_CCCL_BUILTIN_CBRTL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double cbrtl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_CBRTL) + return _CCCL_BUILTIN_CBRTL(__x); +# else // ^^^ _CCCL_BUILTIN_CBRTL ^^^ // vvv !_CCCL_BUILTIN_CBRTL vvv + return ::cbrtl(__x); +# endif // !_CCCL_BUILTIN_CBRTL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half cbrt(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::cbrt(__half2float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 cbrt(__nv_bfloat16 __x) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::cbrt(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double cbrt(_Integer __x) noexcept +{ + return _CUDA_VSTD::cbrt((double) __x); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_ROOTS_H diff --git a/libcudacxx/include/cuda/std/__cmath/rounding_functions.h b/libcudacxx/include/cuda/std/__cmath/rounding_functions.h new file mode 100644 index 00000000000..4404ce446c4 --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/rounding_functions.h @@ -0,0 +1,868 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
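As above, a small sketch of the rounding overloads added by this header, again assuming <cuda/std/cmath> exposes them; the function name and literals are placeholders rather than part of the patch:

    // Sketch only: exercises the cuda::std overloads defined in rounding_functions.h.
    #include <cuda/std/cmath>

    void rounding_demo()
    {
      double up   = cuda::std::ceil(2.1);             // 3.0
      double down = cuda::std::floor(-2.1);           // -3.0
      long   lr   = cuda::std::lround(2.5f);          // 3: halfway cases round away from zero
      double nb   = cuda::std::nearbyint(2.5);        // 2.0 under the default round-to-nearest-even mode
      long long r = cuda::std::llrint(3.7);           // 4 under the default rounding mode
      float  next = cuda::std::nextafter(1.0f, 2.0f); // smallest representable float above 1.0f

      (void) up; (void) down; (void) lr; (void) nb; (void) r; (void) next;
    }

Integral arguments dispatch to the double overloads (for example cuda::std::round(3) returns 3.0), and the __half / __nv_bfloat16 overloads go through float or the native half intrinsics, as the definitions below show.
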
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_ROUNDING_FUNCTIONS_H +#define _LIBCUDACXX___CMATH_ROUNDING_FUNCTIONS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// ceil + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float ceil(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CEILF) + return _CCCL_BUILTIN_CEILF(__x); +#else // ^^^ _CCCL_BUILTIN_CEILF ^^^ // vvv !_CCCL_BUILTIN_CEILF vvv + return ::ceilf(__x); +#endif // !_CCCL_BUILTIN_CEILF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float ceilf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CEILF) + return _CCCL_BUILTIN_CEILF(__x); +#else // ^^^ _CCCL_BUILTIN_CEILF ^^^ // vvv !_CCCL_BUILTIN_CEILF vvv + return ::ceilf(__x); +#endif // !_CCCL_BUILTIN_CEILF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double ceil(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_CEIL) + return _CCCL_BUILTIN_CEIL(__x); +#else // ^^^ _CCCL_BUILTIN_CEIL ^^^ // vvv !_CCCL_BUILTIN_CEIL vvv + return ::ceil(__x); +#endif // !_CCCL_BUILTIN_CEIL +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double ceil(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_CEILL) + return _CCCL_BUILTIN_CEILL(__x); +# else // ^^^ _CCCL_BUILTIN_CEILL ^^^ // vvv !_CCCL_BUILTIN_CEILL vvv + return ::ceill(__x); +# endif // !_CCCL_BUILTIN_CEILL +} +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double ceill(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_CEILL) + return _CCCL_BUILTIN_CEILL(__x); +# else // ^^^ _CCCL_BUILTIN_CEILL ^^^ // vvv !_CCCL_BUILTIN_CEILL vvv + return ::ceill(__x); +# endif // !_CCCL_BUILTIN_CEILL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half ceil(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hceil(__x);), (return __float2half(_CUDA_VSTD::ceil(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 ceil(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hceil(__x);), (return __float2bfloat16(_CUDA_VSTD::ceil(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double ceil(_Integer __x) noexcept +{ + return _CUDA_VSTD::ceil((double) __x); +} + +// floor + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float floor(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_FLOORF) + return _CCCL_BUILTIN_FLOORF(__x); +#else // ^^^ _CCCL_BUILTIN_FLOORF ^^^ // vvv !_CCCL_BUILTIN_FLOORF vvv + return ::floorf(__x); +#endif // !_CCCL_BUILTIN_FLOORF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float floorf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_FLOORF) + return _CCCL_BUILTIN_FLOORF(__x); +#else // ^^^ _CCCL_BUILTIN_FLOORF ^^^ // vvv !_CCCL_BUILTIN_FLOORF vvv + return ::floorf(__x); +#endif // !_CCCL_BUILTIN_FLOORF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double floor(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_FLOOR) + return _CCCL_BUILTIN_FLOOR(__x); +#else // ^^^ _CCCL_BUILTIN_FLOOR ^^^ // vvv 
!_CCCL_BUILTIN_FLOOR vvv + return ::floor(__x); +#endif // !_CCCL_BUILTIN_FLOOR +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double floor(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_FLOORL) + return _CCCL_BUILTIN_FLOORL(__x); +# else // ^^^ _CCCL_BUILTIN_FLOORL ^^^ // vvv !_CCCL_BUILTIN_FLOORL vvv + return ::floorl(__x); +# endif // !_CCCL_BUILTIN_FLOORL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double floorl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_FLOORL) + return _CCCL_BUILTIN_FLOORL(__x); +# else // ^^^ _CCCL_BUILTIN_FLOORL ^^^ // vvv !_CCCL_BUILTIN_FLOORL vvv + return ::floorl(__x); +# endif // !_CCCL_BUILTIN_FLOORL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half floor(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hfloor(__x);), (return __float2half(_CUDA_VSTD::floor(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 floor(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hfloor(__x);), (return __float2bfloat16(_CUDA_VSTD::floor(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double floor(_Integer __x) noexcept +{ + return _CUDA_VSTD::floor((double) __x); +} + +// llrint + +_LIBCUDACXX_HIDE_FROM_ABI long long llrint(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLRINTF) + return _CCCL_BUILTIN_LLRINTF(__x); +#else // ^^^ _CCCL_BUILTIN_LLRINTF ^^^ // vvv !_CCCL_BUILTIN_LLRINTF vvv + return ::llrintf(__x); +#endif // !_CCCL_BUILTIN_LLRINTF +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llrintf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLRINTF) + return _CCCL_BUILTIN_LLRINTF(__x); +#else // ^^^ _CCCL_BUILTIN_LLRINTF ^^^ // vvv !_CCCL_BUILTIN_LLRINTF vvv + return ::llrintf(__x); +#endif // !_CCCL_BUILTIN_LLRINTF +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llrint(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLRINT) + return _CCCL_BUILTIN_LLRINT(__x); +#else // ^^^ _CCCL_BUILTIN_LLRINT ^^^ // vvv !_CCCL_BUILTIN_LLRINT vvv + return ::llrint(__x); +#endif // !_CCCL_BUILTIN_LLRINT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI long long llrint(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LLRINTL) + return _CCCL_BUILTIN_LLRINTL(__x); +# else // ^^^ _CCCL_BUILTIN_LLRINTL ^^^ // vvv !_CCCL_BUILTIN_LLRINTL vvv + return ::llrintl(__x); +# endif // !_CCCL_BUILTIN_LLRINTL +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llrintl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LLRINTL) + return _CCCL_BUILTIN_LLRINTL(__x); +# else // ^^^ _CCCL_BUILTIN_LLRINTL ^^^ // vvv !_CCCL_BUILTIN_LLRINTL vvv + return ::llrintl(__x); +# endif // !_CCCL_BUILTIN_LLRINTL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long long llrint(__half __x) noexcept +{ + return _CUDA_VSTD::llrintf(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long long llrint(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::llrintf(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI long long llrint(_Integer __x) noexcept +{ + return _CUDA_VSTD::llrint((double) 
__x); +} + +// llround + +_LIBCUDACXX_HIDE_FROM_ABI long long llround(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLROUNDF) + return _CCCL_BUILTIN_LLROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_LLROUNDF ^^^ // vvv !_CCCL_BUILTIN_LLROUNDF vvv + return ::llroundf(__x); +#endif // !_CCCL_BUILTIN_LLROUNDF +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llroundf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLROUNDF) + return _CCCL_BUILTIN_LLROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_LLROUNDF ^^^ // vvv !_CCCL_BUILTIN_LLROUNDF vvv + return ::llroundf(__x); +#endif // !_CCCL_BUILTIN_LLROUNDF +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llround(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LLROUND) + return _CCCL_BUILTIN_LLROUND(__x); +#else // ^^^ _CCCL_BUILTIN_LLROUND ^^^ // vvv !_CCCL_BUILTIN_LLROUND vvv + return ::llround(__x); +#endif // !_CCCL_BUILTIN_LLROUND +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI long long llround(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LLROUNDL) + return _CCCL_BUILTIN_LLROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_LLROUNDL ^^^ // vvv !_CCCL_BUILTIN_LLROUNDL vvv + return ::llroundl(__x); +# endif // !_CCCL_BUILTIN_LLROUNDL +} + +_LIBCUDACXX_HIDE_FROM_ABI long long llroundl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LLROUNDL) + return _CCCL_BUILTIN_LLROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_LLROUNDL ^^^ // vvv !_CCCL_BUILTIN_LLROUNDL vvv + return ::llroundl(__x); +# endif // !_CCCL_BUILTIN_LLROUNDL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long long llround(__half __x) noexcept +{ + return _CUDA_VSTD::llroundf(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long long llround(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::llroundf(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI long long llround(_Integer __x) noexcept +{ + return _CUDA_VSTD::llround((double) __x); +} + +// lrint + +_LIBCUDACXX_HIDE_FROM_ABI long lrint(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LRINTF) + return _CCCL_BUILTIN_LRINTF(__x); +#else // ^^^ _CCCL_BUILTIN_LRINTF ^^^ // vvv !_CCCL_BUILTIN_LRINTF vvv + return ::lrintf(__x); +#endif // !_CCCL_BUILTIN_LRINTF +} + +_LIBCUDACXX_HIDE_FROM_ABI long lrintf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LRINTF) + return _CCCL_BUILTIN_LRINTF(__x); +#else // ^^^ _CCCL_BUILTIN_LRINTF ^^^ // vvv !_CCCL_BUILTIN_LRINTF vvv + return ::lrintf(__x); +#endif // !_CCCL_BUILTIN_LRINTF +} + +_LIBCUDACXX_HIDE_FROM_ABI long lrint(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LRINT) + return _CCCL_BUILTIN_LRINT(__x); +#else // ^^^ _CCCL_BUILTIN_LRINT ^^^ // vvv !_CCCL_BUILTIN_LRINT vvv + return ::lrint(__x); +#endif // !_CCCL_BUILTIN_LRINT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI long lrint(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LRINTL) + return _CCCL_BUILTIN_LRINTL(__x); +# else // ^^^ _CCCL_BUILTIN_LRINTL ^^^ // vvv !_CCCL_BUILTIN_LRINTL vvv + return ::lrintl(__x); +# endif // !_CCCL_BUILTIN_LRINTL +} + +_LIBCUDACXX_HIDE_FROM_ABI long lrintl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LRINTL) + return _CCCL_BUILTIN_LRINTL(__x); +# else // ^^^ _CCCL_BUILTIN_LRINTL ^^^ // vvv !_CCCL_BUILTIN_LRINTL vvv + return ::lrintl(__x); +# endif // !_CCCL_BUILTIN_LRINTL +} +#endif // 
!_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long lrint(__half __x) noexcept +{ + return _CUDA_VSTD::lrintf(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long lrint(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::lrintf(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI long lrint(_Integer __x) noexcept +{ + return _CUDA_VSTD::lrint((double) __x); +} + +// lround + +_LIBCUDACXX_HIDE_FROM_ABI long lround(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LROUNDF) + return _CCCL_BUILTIN_LROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_LROUNDF ^^^ // vvv !_CCCL_BUILTIN_LROUNDF vvv + return ::lroundf(__x); +#endif // !_CCCL_BUILTIN_LROUNDF +} + +_LIBCUDACXX_HIDE_FROM_ABI long lroundf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LROUNDF) + return _CCCL_BUILTIN_LROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_LROUNDF ^^^ // vvv !_CCCL_BUILTIN_LROUNDF vvv + return ::lroundf(__x); +#endif // !_CCCL_BUILTIN_LROUNDF +} + +_LIBCUDACXX_HIDE_FROM_ABI long lround(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LROUND) + return _CCCL_BUILTIN_LROUND(__x); +#else // ^^^ _CCCL_BUILTIN_LROUND ^^^ // vvv !_CCCL_BUILTIN_LROUND vvv + return ::lround(__x); +#endif // !_CCCL_BUILTIN_LROUND +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI long lround(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LROUNDL) + return _CCCL_BUILTIN_LROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_LROUNDL ^^^ // vvv !_CCCL_BUILTIN_LROUNDL vvv + return ::lroundl(__x); +# endif // !_CCCL_BUILTIN_LROUNDL +} + +_LIBCUDACXX_HIDE_FROM_ABI long lroundl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LROUNDL) + return _CCCL_BUILTIN_LROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_LROUNDL ^^^ // vvv !_CCCL_BUILTIN_LROUNDL vvv + return ::lroundl(__x); +# endif // !_CCCL_BUILTIN_LROUNDL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long lround(__half __x) noexcept +{ + return _CUDA_VSTD::lroundf(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long lround(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::lroundf(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI long lround(_Integer __x) noexcept +{ + return _CUDA_VSTD::lround((double) __x); +} + +// nearbyint + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float nearbyint(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_NEARBYINTF) + return _CCCL_BUILTIN_NEARBYINTF(__x); +#else // ^^^ _CCCL_BUILTIN_NEARBYINTF ^^^ // vvv !_CCCL_BUILTIN_NEARBYINTF vvv + return ::nearbyintf(__x); +#endif // !_CCCL_BUILTIN_NEARBYINTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float nearbyintf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_NEARBYINTF) + return _CCCL_BUILTIN_NEARBYINTF(__x); +#else // ^^^ _CCCL_BUILTIN_NEARBYINTF ^^^ // vvv !_CCCL_BUILTIN_NEARBYINTF vvv + return ::nearbyintf(__x); +#endif // !_CCCL_BUILTIN_NEARBYINTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double nearbyint(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_NEARBYINT) + return _CCCL_BUILTIN_NEARBYINT(__x); +#else // ^^^ _CCCL_BUILTIN_NEARBYINT ^^^ // vvv !_CCCL_BUILTIN_NEARBYINT vvv + return ::nearbyint(__x); +#endif // !_CCCL_BUILTIN_NEARBYINT +} + +#if 
!defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double nearbyint(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_NEARBYINTL) + return _CCCL_BUILTIN_NEARBYINTL(__x); +# else // ^^^ _CCCL_BUILTIN_NEARBYINTL ^^^ // vvv !_CCCL_BUILTIN_NEARBYINTL vvv + return ::nearbyintl(__x); +# endif // !_CCCL_BUILTIN_NEARBYINTL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double nearbyintl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_NEARBYINTL) + return _CCCL_BUILTIN_NEARBYINTL(__x); +# else // ^^^ _CCCL_BUILTIN_NEARBYINTL ^^^ // vvv !_CCCL_BUILTIN_NEARBYINTL vvv + return ::nearbyintl(__x); +# endif // !_CCCL_BUILTIN_NEARBYINTL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half nearbyint(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::nearbyintf(__half2float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 nearbyint(__nv_bfloat16 __x) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::nearbyintf(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double nearbyint(_Integer __x) noexcept +{ + return _CUDA_VSTD::nearbyint((double) __x); +} + +// nextafter + +_LIBCUDACXX_HIDE_FROM_ABI float nextafter(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_NEXTAFTERF) + return _CCCL_BUILTIN_NEXTAFTERF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_NEXTAFTERF ^^^ // vvv !_CCCL_BUILTIN_NEXTAFTERF vvv + return ::nextafterf(__x, __y); +#endif // !_CCCL_BUILTIN_NEXTAFTERF +} + +_LIBCUDACXX_HIDE_FROM_ABI float nextafterf(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_NEXTAFTERF) + return _CCCL_BUILTIN_NEXTAFTERF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_NEXTAFTERF ^^^ // vvv !_CCCL_BUILTIN_NEXTAFTERF vvv + return ::nextafterf(__x, __y); +#endif // !_CCCL_BUILTIN_NEXTAFTERF +} + +_LIBCUDACXX_HIDE_FROM_ABI double nextafter(double __x, double __y) noexcept +{ +#if defined(_CCCL_BUILTIN_NEXTAFTER) + return _CCCL_BUILTIN_NEXTAFTER(__x, __y); +#else // ^^^ _CCCL_BUILTIN_NEXTAFTER ^^^ // vvv !_CCCL_BUILTIN_NEXTAFTER vvv + return ::nextafter(__x, __y); +#endif // !_CCCL_BUILTIN_NEXTAFTER +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI long double nextafter(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTAFTERL) + return _CCCL_BUILTIN_NEXTAFTERL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTAFTERL ^^^ // vvv !_CCCL_BUILTIN_NEXTAFTERL vvv + return ::nextafterl(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTAFTERL +} + +_LIBCUDACXX_HIDE_FROM_ABI long double nextafterl(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTAFTERL) + return _CCCL_BUILTIN_NEXTAFTERL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTAFTERL ^^^ // vvv !_CCCL_BUILTIN_NEXTAFTERL vvv + return ::nextafterl(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTAFTERL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half nextafter(__half __x, __half __y) noexcept +{ + return __float2half(_CUDA_VSTD::nextafterf(__half2float(__x), __half2float(__y))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 nextafter(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept +{ + return 
__float2bfloat16(_CUDA_VSTD::nextafterf(__bfloat162float(__x), __bfloat162float(__y))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, _A2> nextafter(_A1 __x, _A2 __y) noexcept +{ + using __result_type = __promote_t<_A1, _A2>; + static_assert(!(_CCCL_TRAIT(is_same, _A1, __result_type) && _CCCL_TRAIT(is_same, _A2, __result_type)), ""); + return _CUDA_VSTD::nextafter(static_cast<__result_type>(__x), static_cast<__result_type>(__y)); +} + +// nexttoward + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_LIBCUDACXX_HIDE_FROM_ABI float nexttoward(float __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTTOWARDF) + return _CCCL_BUILTIN_NEXTTOWARDF(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTTOWARDF ^^^ // vvv !_CCCL_BUILTIN_NEXTTOWARDF vvv + return ::nexttowardf(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTTOWARDF +} + +_LIBCUDACXX_HIDE_FROM_ABI float nexttowardf(float __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTTOWARDF) + return _CCCL_BUILTIN_NEXTTOWARDF(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTTOWARDF ^^^ // vvv !_CCCL_BUILTIN_NEXTTOWARDF vvv + return ::nexttowardf(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTTOWARDF +} + +_LIBCUDACXX_HIDE_FROM_ABI double nexttoward(double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTTOWARD) + return _CCCL_BUILTIN_NEXTTOWARD(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTTOWARD ^^^ // vvv !_CCCL_BUILTIN_NEXTTOWARD vvv + return ::nexttoward(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTTOWARD +} + +_LIBCUDACXX_HIDE_FROM_ABI long double nexttoward(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTTOWARDL) + return _CCCL_BUILTIN_NEXTTOWARDL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTTOWARDL ^^^ // vvv !_CCCL_BUILTIN_NEXTTOWARDL vvv + return ::nexttowardl(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTTOWARDL +} + +_LIBCUDACXX_HIDE_FROM_ABI long double nexttowardl(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_NEXTTOWARDL) + return _CCCL_BUILTIN_NEXTTOWARDL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_NEXTTOWARDL ^^^ // vvv !_CCCL_BUILTIN_NEXTTOWARDL vvv + return ::nexttowardl(__x, __y); +# endif // !_CCCL_BUILTIN_NEXTTOWARDL +} + +# if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half nexttoward(__half __x, long double __y) noexcept +{ + return __float2half(_CUDA_VSTD::nexttowardf(__half2float(__x), __y)); +} +# endif // _LIBCUDACXX_HAS_NVFP16 + +# if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 nexttoward(__nv_bfloat16 __x, long double __y) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::nexttowardf(__bfloat162float(__x), __y)); +} +# endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_LIBCUDACXX_HIDE_FROM_ABI double nexttoward(_Integer __x, long double __y) noexcept +{ + return _CUDA_VSTD::nexttoward((double) __x, __y); +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +// rint + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float rint(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_RINTF) + return _CCCL_BUILTIN_RINTF(__x); +#else // ^^^ _CCCL_BUILTIN_RINTF ^^^ // vvv !_CCCL_BUILTIN_RINTF vvv + return ::rintf(__x); +#endif // !_CCCL_BUILTIN_RINTF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float rintf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_RINTF) + return _CCCL_BUILTIN_RINTF(__x); +#else // ^^^ _CCCL_BUILTIN_RINTF ^^^ // vvv !_CCCL_BUILTIN_RINTF vvv + return ::rintf(__x); +#endif // !_CCCL_BUILTIN_RINTF +} + +_CCCL_NODISCARD 
_LIBCUDACXX_HIDE_FROM_ABI double rint(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_RINT) + return _CCCL_BUILTIN_RINT(__x); +#else // ^^^ _CCCL_BUILTIN_RINT ^^^ // vvv !_CCCL_BUILTIN_RINT vvv + return ::rint(__x); +#endif // !_CCCL_BUILTIN_RINT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double rint(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_RINTL) + return _CCCL_BUILTIN_RINTL(__x); +# else // ^^^ _CCCL_BUILTIN_RINTL ^^^ // vvv !_CCCL_BUILTIN_RINTL vvv + return ::rintl(__x); +# endif // !_CCCL_BUILTIN_RINTL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double rintl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_RINTL) + return _CCCL_BUILTIN_RINTL(__x); +# else // ^^^ _CCCL_BUILTIN_RINTL ^^^ // vvv !_CCCL_BUILTIN_RINTL vvv + return ::rintl(__x); +# endif // !_CCCL_BUILTIN_RINTL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half rint(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hrint(__x);), (return __float2half(_CUDA_VSTD::rint(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 rint(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hrint(__x);), (return __float2bfloat16(_CUDA_VSTD::rint(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double rint(_Integer __x) noexcept +{ + return _CUDA_VSTD::rint((double) __x); +} + +// round + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float round(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ROUNDF) + return _CCCL_BUILTIN_ROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_ROUNDF ^^^ // vvv !_CCCL_BUILTIN_ROUNDF vvv + return ::roundf(__x); +#endif // !_CCCL_BUILTIN_ROUNDF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float roundf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ROUNDF) + return _CCCL_BUILTIN_ROUNDF(__x); +#else // ^^^ _CCCL_BUILTIN_ROUNDF ^^^ // vvv !_CCCL_BUILTIN_ROUNDF vvv + return ::roundf(__x); +#endif // !_CCCL_BUILTIN_ROUNDF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double round(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ROUND) + return _CCCL_BUILTIN_ROUND(__x); +#else // ^^^ _CCCL_BUILTIN_ROUND ^^^ // vvv !_CCCL_BUILTIN_ROUND vvv + return ::round(__x); +#endif // !_CCCL_BUILTIN_ROUND +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double round(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ROUNDL) + return _CCCL_BUILTIN_ROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_ROUNDL ^^^ // vvv !_CCCL_BUILTIN_ROUNDL vvv + return ::roundl(__x); +# endif // !_CCCL_BUILTIN_ROUNDL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double roundl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ROUNDL) + return _CCCL_BUILTIN_ROUNDL(__x); +# else // ^^^ _CCCL_BUILTIN_ROUNDL ^^^ // vvv !_CCCL_BUILTIN_ROUNDL vvv + return ::roundl(__x); +# endif // !_CCCL_BUILTIN_ROUNDL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half round(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::roundf(__half2float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 round(__nv_bfloat16 __x) noexcept +{ + return 
__float2bfloat16(_CUDA_VSTD::roundf(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double round(_Integer __x) noexcept +{ + return _CUDA_VSTD::round((double) __x); +} + +// trunc + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float trunc(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_TRUNCF) + return _CCCL_BUILTIN_TRUNCF(__x); +#else // ^^^ _CCCL_BUILTIN_TRUNCF ^^^ // vvv !_CCCL_BUILTIN_TRUNCF vvv + return ::truncf(__x); +#endif // !_CCCL_BUILTIN_TRUNCF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float truncf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_TRUNCF) + return _CCCL_BUILTIN_TRUNCF(__x); +#else // ^^^ _CCCL_BUILTIN_TRUNCF ^^^ // vvv !_CCCL_BUILTIN_TRUNCF vvv + return ::truncf(__x); +#endif // !_CCCL_BUILTIN_TRUNCF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double trunc(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_TRUNC) + return _CCCL_BUILTIN_TRUNC(__x); +#else // ^^^ _CCCL_BUILTIN_TRUNC ^^^ // vvv !_CCCL_BUILTIN_TRUNC vvv + return ::trunc(__x); +#endif // !_CCCL_BUILTIN_TRUNC +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double trunc(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_TRUNCL) + return _CCCL_BUILTIN_TRUNCL(__x); +# else // ^^^ _CCCL_BUILTIN_TRUNCL ^^^ // vvv !_CCCL_BUILTIN_TRUNCL vvv + return ::truncl(__x); +# endif // !_CCCL_BUILTIN_TRUNCL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double truncl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_TRUNCL) + return _CCCL_BUILTIN_TRUNCL(__x); +# else // ^^^ _CCCL_BUILTIN_TRUNCL ^^^ // vvv !_CCCL_BUILTIN_TRUNCL vvv + return ::truncl(__x); +# endif // !_CCCL_BUILTIN_TRUNCL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half trunc(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::htrunc(__x);), (return __float2half(_CUDA_VSTD::trunc(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 trunc(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::htrunc(__x);), (return __float2bfloat16(_CUDA_VSTD::trunc(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double trunc(_Integer __x) noexcept +{ + return _CUDA_VSTD::trunc((double) __x); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_ROUNDING_FUNCTIONS_H diff --git a/libcudacxx/include/cuda/std/__complex/nvbf16.h b/libcudacxx/include/cuda/std/__complex/nvbf16.h index 1282b47f6d9..99fcde51002 100644 --- a/libcudacxx/include/cuda/std/__complex/nvbf16.h +++ b/libcudacxx/include/cuda/std/__complex/nvbf16.h @@ -85,8 +85,8 @@ struct __type_to_vector<__nv_bfloat16> template <> struct __cccl_complex_overload_traits<__nv_bfloat16, false, false> { - typedef __nv_bfloat16 _ValueType; - typedef complex<__nv_bfloat16> _ComplexType; + using _ValueType = __nv_bfloat16; + using _ComplexType = complex<__nv_bfloat16>; }; template diff --git a/libcudacxx/include/cuda/std/__complex/nvfp16.h b/libcudacxx/include/cuda/std/__complex/nvfp16.h index bc2da05d61d..7e51a81d8cb 100644 --- a/libcudacxx/include/cuda/std/__complex/nvfp16.h +++ b/libcudacxx/include/cuda/std/__complex/nvfp16.h @@ -82,8 +82,8 @@ struct __type_to_vector<__half> template <> struct __cccl_complex_overload_traits<__half, false, false> { - typedef 
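The rounding-function overloads above all follow one shape: call the compiler builtin when its detection macro (_CCCL_BUILTIN_TRUNCF and friends) is defined, otherwise fall back to the C math library, and implement the __half / __nv_bfloat16 overloads by widening to float and narrowing the result. A minimal sketch of that pattern, assuming a hypothetical free function my_trunc (only _CCCL_BUILTIN_TRUNCF, __half2float and __float2half are taken from the hunks above and from CUDA's cuda_fp16.h):

    #include <cmath>
    #include <cuda_fp16.h>

    __host__ __device__ inline float my_trunc(float x) noexcept
    {
    #if defined(_CCCL_BUILTIN_TRUNCF)
      return _CCCL_BUILTIN_TRUNCF(x); // compiler builtin when available
    #else
      return ::truncf(x); // C library on the host, CUDA math library on the device
    #endif
    }

    __host__ __device__ inline __half my_trunc(__half x) noexcept
    {
      // __half has no dedicated C entry point, so round-trip through float.
      return __float2half(my_trunc(__half2float(x)));
    }
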
__half _ValueType; - typedef complex<__half> _ComplexType; + using _ValueType = __half; + using _ComplexType = complex<__half>; }; template diff --git a/libcudacxx/include/cuda/std/__cuda/cstdint_prelude.h b/libcudacxx/include/cuda/std/__cuda/cstdint_prelude.h deleted file mode 100644 index 5111e9dd82a..00000000000 --- a/libcudacxx/include/cuda/std/__cuda/cstdint_prelude.h +++ /dev/null @@ -1,90 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX___CUDA_CSTDINT_PRELUDE_H -#define _LIBCUDACXX___CUDA_CSTDINT_PRELUDE_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if !_CCCL_COMPILER(NVRTC) -# include -#else // ^^^ !_CCCL_COMPILER(NVRTC) ^^^ / vvv _CCCL_COMPILER(NVRTC) vvv -typedef signed char int8_t; -typedef unsigned char uint8_t; -typedef signed short int16_t; -typedef unsigned short uint16_t; -typedef signed int int32_t; -typedef unsigned int uint32_t; -typedef signed long long int64_t; -typedef unsigned long long uint64_t; - -# define _LIBCUDACXX_ADDITIONAL_INTS(N) \ - typedef int##N##_t int_fast##N##_t; \ - typedef uint##N##_t uint_fast##N##_t; \ - typedef int##N##_t int_least##N##_t; \ - typedef uint##N##_t uint_least##N##_t - -_LIBCUDACXX_ADDITIONAL_INTS(8); -_LIBCUDACXX_ADDITIONAL_INTS(16); -_LIBCUDACXX_ADDITIONAL_INTS(32); -_LIBCUDACXX_ADDITIONAL_INTS(64); -# undef _LIBCUDACXX_ADDITIONAL_INTS - -typedef int64_t intptr_t; -typedef uint64_t uintptr_t; -typedef int64_t intmax_t; -typedef uint64_t uintmax_t; - -# define INT8_MIN SCHAR_MIN -# define INT16_MIN SHRT_MIN -# define INT32_MIN INT_MIN -# define INT64_MIN LLONG_MIN -# define INT8_MAX SCHAR_MAX -# define INT16_MAX SHRT_MAX -# define INT32_MAX INT_MAX -# define INT64_MAX LLONG_MAX -# define UINT8_MAX UCHAR_MAX -# define UINT16_MAX USHRT_MAX -# define UINT32_MAX UINT_MAX -# define UINT64_MAX ULLONG_MAX -# define INT_FAST8_MIN SCHAR_MIN -# define INT_FAST16_MIN SHRT_MIN -# define INT_FAST32_MIN INT_MIN -# define INT_FAST64_MIN LLONG_MIN -# define INT_FAST8_MAX SCHAR_MAX -# define INT_FAST16_MAX SHRT_MAX -# define INT_FAST32_MAX INT_MAX -# define INT_FAST64_MAX LLONG_MAX -# define UINT_FAST8_MAX UCHAR_MAX -# define UINT_FAST16_MAX USHRT_MAX -# define UINT_FAST32_MAX UINT_MAX -# define UINT_FAST64_MAX ULLONG_MAX - -# define INT8_C(X) ((int_least8_t) (X)) -# define INT16_C(X) ((int_least16_t) (X)) -# define INT32_C(X) ((int_least32_t) (X)) -# define INT64_C(X) ((int_least64_t) (X)) -# define UINT8_C(X) ((uint_least8_t) (X)) -# define UINT16_C(X) ((uint_least16_t) (X)) -# define UINT32_C(X) ((uint_least32_t) (X)) -# define UINT64_C(X) ((uint_least64_t) (X)) -# define INTMAX_C(X) ((intmax_t) (X)) -# define UINTMAX_C(X) ((uintmax_t) (X)) -#endif // _CCCL_COMPILER(NVRTC) - -#endif // _LIBCUDACXX___CUDA_CSTDINT_PRELUDE_H diff --git a/libcudacxx/include/cuda/std/__exception/cuda_error.h b/libcudacxx/include/cuda/std/__exception/cuda_error.h index 40af7d6c3e6..fdc32cf0571 
100644 --- a/libcudacxx/include/cuda/std/__exception/cuda_error.h +++ b/libcudacxx/include/cuda/std/__exception/cuda_error.h @@ -22,10 +22,6 @@ # pragma system_header #endif // no system header -#if _CCCL_CUDA_COMPILER(CLANG) -# include -#endif // _CCCL_CUDA_COMPILER(CLANG) - #include #if !_CCCL_COMPILER(NVRTC) @@ -40,8 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA /** * @brief Exception thrown when a CUDA error is encountered. */ -#if _CCCL_HAS_CUDA_COMPILER -# ifndef _CCCL_NO_EXCEPTIONS +#ifndef _CCCL_NO_EXCEPTIONS class cuda_error : public ::std::runtime_error { private: @@ -50,37 +45,36 @@ class cuda_error : public ::std::runtime_error char __buffer[256]; }; - static char* __format_cuda_error(::cudaError_t __status, const char* __msg, char* __msg_buffer) noexcept + static char* __format_cuda_error(const int __status, const char* __msg, char* __msg_buffer) noexcept { ::snprintf(__msg_buffer, 256, "cudaError %d: %s", __status, __msg); return __msg_buffer; } public: - cuda_error(::cudaError_t __status, const char* __msg, __msg_storage __msg_buffer = {0}) noexcept + cuda_error(const int __status, const char* __msg, __msg_storage __msg_buffer = {0}) noexcept : ::std::runtime_error(__format_cuda_error(__status, __msg, __msg_buffer.__buffer)) {} }; -_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(::cudaError_t __status, const char* __msg) +_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(const int __status, const char* __msg) { NV_IF_ELSE_TARGET(NV_IS_HOST, (throw ::cuda::cuda_error(__status, __msg);), ((void) __status; (void) __msg; _CUDA_VSTD_NOVERSION::terminate();)) } -# else // ^^^ !_CCCL_NO_EXCEPTIONS ^^^ / vvv _CCCL_NO_EXCEPTIONS vvv +#else // ^^^ !_CCCL_NO_EXCEPTIONS ^^^ / vvv _CCCL_NO_EXCEPTIONS vvv class cuda_error { public: - _LIBCUDACXX_HIDE_FROM_ABI cuda_error(::cudaError_t, const char*) noexcept {} + _LIBCUDACXX_HIDE_FROM_ABI cuda_error(const int, const char*) noexcept {} }; -_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(::cudaError_t, const char*) +_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __throw_cuda_error(const int, const char*) { _CUDA_VSTD_NOVERSION::terminate(); } -# endif // _CCCL_NO_EXCEPTIONS -#endif // _CCCL_CUDA_COMPILER +#endif // _CCCL_NO_EXCEPTIONS _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/__exception/terminate.h b/libcudacxx/include/cuda/std/__exception/terminate.h index a65722bac64..c5dd9a7e6cb 100644 --- a/libcudacxx/include/cuda/std/__exception/terminate.h +++ b/libcudacxx/include/cuda/std/__exception/terminate.h @@ -37,7 +37,7 @@ _CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __cccl_terminate() noexcept #if 0 // Expose once atomic is universally available -typedef void (*terminate_handler)(); +using terminate_handler = void (*)(); # ifdef __CUDA_ARCH__ __device__ diff --git a/libcudacxx/include/cuda/std/__functional/binary_function.h b/libcudacxx/include/cuda/std/__functional/binary_function.h index af7230678b7..5b400088e4c 100644 --- a/libcudacxx/include/cuda/std/__functional/binary_function.h +++ b/libcudacxx/include/cuda/std/__functional/binary_function.h @@ -27,9 +27,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct _CCCL_TYPE_VISIBILITY_DEFAULT _LIBCUDACXX_DEPRECATED_IN_CXX11 binary_function { - typedef _Arg1 first_argument_type; - typedef _Arg2 second_argument_type; - typedef _Result result_type; + using first_argument_type = _Arg1; + using second_argument_type = _Arg2; + using result_type = _Result; }; #endif // _CCCL_STD_VER <= 2014 || 
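With the cuda_error change above, cuda::cuda_error and __throw_cuda_error take a plain int status instead of ::cudaError_t, so the exception machinery no longer needs the CUDA runtime header. Per the hunk, the constructor formats "cudaError %d: %s" into a fixed 256-byte buffer, and __throw_cuda_error throws on the host but terminates on the device. A hedged usage sketch of that internal helper (check_cuda is a hypothetical function; the status value 700 is only an example):

    #include <cuda/std/__exception/cuda_error.h> // header modified above

    void check_cuda(int status, const char* msg)
    {
      if (status != 0)
      {
        // Throws cuda::cuda_error on the host; on the device this path terminates.
        ::cuda::__throw_cuda_error(status, msg);
      }
    }

    // check_cuda(700, "an illegal memory access was encountered") surfaces an exception
    // whose what() reads "cudaError 700: an illegal memory access was encountered".
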
defined(_LIBCUDACXX_ENABLE_CXX17_REMOVED_UNARY_BINARY_FUNCTION) diff --git a/libcudacxx/include/cuda/std/__functional/bind.h b/libcudacxx/include/cuda/std/__functional/bind.h index 0c1beac45c9..7542191c0d8 100644 --- a/libcudacxx/include/cuda/std/__functional/bind.h +++ b/libcudacxx/include/cuda/std/__functional/bind.h @@ -122,7 +122,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI enable_if_t::value, __invoke_of<_Ti&, _Uj...>> __mu(_Ti& __ti, tuple<_Uj...>& __uj) { - typedef __make_tuple_indices_t __indices; + using __indices = __make_tuple_indices_t; return _CUDA_VSTD::__mu_expand(__ti, __uj, __indices()); } @@ -133,7 +133,7 @@ struct __mu_return2 template struct __mu_return2 { - typedef __tuple_element_t::value - 1, _Uj> type; + using type = __tuple_element_t::value - 1, _Uj>; }; template @@ -160,13 +160,13 @@ struct __mu_return_impl; template struct __mu_return_invokable // false { - typedef __nat type; + using type = __nat; }; template struct __mu_return_invokable { - typedef typename __invoke_of<_Ti&, _Uj...>::type type; + using type = typename __invoke_of<_Ti&, _Uj...>::type; }; template @@ -177,19 +177,19 @@ struct __mu_return_impl<_Ti, false, true, false, tuple<_Uj...>> template struct __mu_return_impl<_Ti, false, false, true, _TupleUj> { - typedef __tuple_element_t::value - 1, _TupleUj>&& type; + using type = __tuple_element_t::value - 1, _TupleUj>&&; }; template struct __mu_return_impl<_Ti, true, false, false, _TupleUj> { - typedef typename _Ti::type& type; + using type = typename _Ti::type&; }; template struct __mu_return_impl<_Ti, false, false, false, _TupleUj> { - typedef _Ti& type; + using type = _Ti&; }; template @@ -226,13 +226,13 @@ struct __bind_return; template struct __bind_return<_Fp, tuple<_BoundArgs...>, _TupleUj, true> { - typedef typename __invoke_of<_Fp&, typename __mu_return<_BoundArgs, _TupleUj>::type...>::type type; + using type = typename __invoke_of<_Fp&, typename __mu_return<_BoundArgs, _TupleUj>::type...>::type; }; template struct __bind_return<_Fp, const tuple<_BoundArgs...>, _TupleUj, true> { - typedef typename __invoke_of<_Fp&, typename __mu_return::type...>::type type; + using type = typename __invoke_of<_Fp&, typename __mu_return::type...>::type; }; template @@ -249,14 +249,14 @@ template class __bind : public __weak_result_type> { protected: - typedef decay_t<_Fp> _Fd; - typedef tuple...> _Td; + using _Fd = decay_t<_Fp>; + using _Td = tuple...>; private: _Fd __f_; _Td __bound_args_; - typedef __make_tuple_indices_t __indices; + using __indices = __make_tuple_indices_t; public: template > : public true_type template class __bind_r : public __bind<_Fp, _BoundArgs...> { - typedef __bind<_Fp, _BoundArgs...> base; - typedef typename base::_Fd _Fd; - typedef typename base::_Td _Td; + using base = __bind<_Fp, _BoundArgs...>; + using _Fd = typename base::_Fd; + using _Td = typename base::_Td; public: - typedef _Rp result_type; + using result_type = _Rp; template result_type> operator()(_Args&&... __args) { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; + using _Invoker = __invoke_void_return_wrapper<_Rp>; return _Invoker::__call(static_cast(*this), _CUDA_VSTD::forward<_Args>(__args)...); } @@ -321,7 +321,7 @@ class __bind_r : public __bind<_Fp, _BoundArgs...> result_type> operator()(_Args&&... 
__args) const { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; + using _Invoker = __invoke_void_return_wrapper<_Rp>; return _Invoker::__call(static_cast(*this), _CUDA_VSTD::forward<_Args>(__args)...); } }; @@ -333,7 +333,7 @@ struct is_bind_expression<__bind_r<_Rp, _Fp, _BoundArgs...>> : public true_type template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 __bind<_Fp, _BoundArgs...> bind(_Fp&& __f, _BoundArgs&&... __bound_args) { - typedef __bind<_Fp, _BoundArgs...> type; + using type = __bind<_Fp, _BoundArgs...>; return type(_CUDA_VSTD::forward<_Fp>(__f), _CUDA_VSTD::forward<_BoundArgs>(__bound_args)...); } @@ -341,7 +341,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 __bind_r<_Rp, _Fp, _BoundArgs...> bind(_Fp&& __f, _BoundArgs&&... __bound_args) { - typedef __bind_r<_Rp, _Fp, _BoundArgs...> type; + using type = __bind_r<_Rp, _Fp, _BoundArgs...>; return type(_CUDA_VSTD::forward<_Fp>(__f), _CUDA_VSTD::forward<_BoundArgs>(__bound_args)...); } diff --git a/libcudacxx/include/cuda/std/__functional/function.h b/libcudacxx/include/cuda/std/__functional/function.h index e2ec912e6fb..6544f572c81 100644 --- a/libcudacxx/include/cuda/std/__functional/function.h +++ b/libcudacxx/include/cuda/std/__functional/function.h @@ -136,8 +136,8 @@ class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> __compressed_pair<_Fp, _Ap> __f_; public: - typedef _CCCL_NODEBUG_ALIAS _Fp _Target; - typedef _CCCL_NODEBUG_ALIAS _Ap _Alloc; + using _Target _CCCL_NODEBUG_ALIAS = _Fp; + using _Alloc _CCCL_NODEBUG_ALIAS = _Ap; _LIBCUDACXX_HIDE_FROM_ABI const _Target& __target() const { @@ -170,16 +170,16 @@ class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> _LIBCUDACXX_HIDE_FROM_ABI _Rp operator()(_ArgTypes&&... __arg) { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; + using _Invoker = __invoke_void_return_wrapper<_Rp>; return _Invoker::__call(__f_.first(), _CUDA_VSTD::forward<_ArgTypes>(__arg)...); } _LIBCUDACXX_HIDE_FROM_ABI __alloc_func* __clone() const { - typedef allocator_traits<_Alloc> __alloc_traits; - typedef typename __rebind_alloc_helper<__alloc_traits, __alloc_func>::type _AA; + using __alloc_traits = allocator_traits<_Alloc>; + using _AA = typename __rebind_alloc_helper<__alloc_traits, __alloc_func>::type; _AA __a(__f_.second()); - typedef __allocator_destructor<_AA> _Dp; + using _Dp = __allocator_destructor<_AA>; unique_ptr<__alloc_func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); ::new ((void*) __hold.get()) __alloc_func(__f_.first(), _Alloc(__a)); return __hold.release(); @@ -192,8 +192,8 @@ class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> static void __destroy_and_delete(__alloc_func* __f) { - typedef allocator_traits<_Alloc> __alloc_traits; - typedef typename __rebind_alloc_helper<__alloc_traits, __alloc_func>::type _FunAlloc; + using __alloc_traits = allocator_traits<_Alloc>; + using _FunAlloc = typename __rebind_alloc_helper<__alloc_traits, __alloc_func>::type; _FunAlloc __a(__f->__get_allocator()); __f->destroy(); __a.deallocate(__f, 1); @@ -206,7 +206,7 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> _Fp __f_; public: - typedef _CCCL_NODEBUG_ALIAS _Fp _Target; + using _Target _CCCL_NODEBUG_ALIAS = _Fp; _LIBCUDACXX_HIDE_FROM_ABI const _Target& __target() const { @@ -223,7 +223,7 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> _LIBCUDACXX_HIDE_FROM_ABI _Rp operator()(_ArgTypes&&... 
__arg) { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; + using _Invoker = __invoke_void_return_wrapper<_Rp>; return _Invoker::__call(__f_, _CUDA_VSTD::forward<_ArgTypes>(__arg)...); } @@ -313,10 +313,10 @@ class __func<_Fp, _Alloc, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> template __base<_Rp(_ArgTypes...)>* __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::__clone() const { - typedef allocator_traits<_Alloc> __alloc_traits; - typedef typename __rebind_alloc_helper<__alloc_traits, __func>::type _Ap; + using __alloc_traits = allocator_traits<_Alloc>; + using _Ap = typename __rebind_alloc_helper<__alloc_traits, __func>::type; _Ap __a(__f_.__get_allocator()); - typedef __allocator_destructor<_Ap> _Dp; + using _Dp = __allocator_destructor<_Ap>; unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); ::new ((void*) __hold.get()) __func(__f_.__target(), _Alloc(__a)); return __hold.release(); @@ -337,8 +337,8 @@ void __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::destroy() noexcept template void __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::destroy_deallocate() noexcept { - typedef allocator_traits<_Alloc> __alloc_traits; - typedef typename __rebind_alloc_helper<__alloc_traits, __func>::type _Ap; + using __alloc_traits = allocator_traits<_Alloc>; + using _Ap = typename __rebind_alloc_helper<__alloc_traits, __func>::type; _Ap __a(__f_.__get_allocator()); __f_.destroy(); __a.deallocate(this, 1); @@ -380,7 +380,7 @@ class __value_func<_Rp(_ArgTypes...)> { typename aligned_storage<3 * sizeof(void*)>::type __buf_; - typedef __base<_Rp(_ArgTypes...)> __func; + using __func = __base<_Rp(_ArgTypes...)>; __func* __f_; _LIBCUDACXX_NO_CFI static __func* __as_base(void* __p) @@ -397,9 +397,9 @@ class __value_func<_Rp(_ArgTypes...)> _LIBCUDACXX_HIDE_FROM_ABI __value_func(_Fp&& __f, const _Alloc& __a) : __f_(nullptr) { - typedef allocator_traits<_Alloc> __alloc_traits; - typedef __function::__func<_Fp, _Alloc, _Rp(_ArgTypes...)> _Fun; - typedef typename __rebind_alloc_helper<__alloc_traits, _Fun>::type _FunAlloc; + using __alloc_traits = allocator_traits<_Alloc>; + using _Fun = __function::__func<_Fp, _Alloc, _Rp(_ArgTypes...)>; + using _FunAlloc = typename __rebind_alloc_helper<__alloc_traits, _Fun>::type; if (__function::__not_null(__f)) { @@ -411,7 +411,7 @@ class __value_func<_Rp(_ArgTypes...)> } else { - typedef __allocator_destructor<_FunAlloc> _Dp; + using _Dp = __allocator_destructor<_FunAlloc>; unique_ptr<__func, _Dp> __hold(__af.allocate(1), _Dp(__af, 1)); ::new ((void*) __hold.get()) _Fun(_CUDA_VSTD::move(__f), _Alloc(__a)); __f_ = __hold.release(); @@ -697,7 +697,7 @@ struct __policy_invoker; template struct __policy_invoker<_Rp(_ArgTypes...)> { - typedef _Rp (*__Call)(const __policy_storage*, __fast_forward<_ArgTypes>...); + using __Call = _Rp (*)(const __policy_storage*, __fast_forward<_ArgTypes>...); __Call __call_; @@ -746,7 +746,7 @@ class __policy_func<_Rp(_ArgTypes...)> // Calls the value stored in __buf_. This could technically be part of // policy, but storing it here eliminates a level of indirection inside // operator(). - typedef __function::__policy_invoker<_Rp(_ArgTypes...)> __invoker; + using __invoker = __function::__policy_invoker<_Rp(_ArgTypes...)>; __invoker __invoker_; // The policy that describes how to move / copy / destroy __buf_. 
Never @@ -762,9 +762,9 @@ class __policy_func<_Rp(_ArgTypes...)> _LIBCUDACXX_HIDE_FROM_ABI __policy_func(_Fp&& __f, const _Alloc& __a) : __policy_(__policy::__create_empty()) { - typedef __alloc_func<_Fp, _Alloc, _Rp(_ArgTypes...)> _Fun; - typedef allocator_traits<_Alloc> __alloc_traits; - typedef typename __rebind_alloc_helper<__alloc_traits, _Fun>::type _FunAlloc; + using _Fun = __alloc_func<_Fp, _Alloc, _Rp(_ArgTypes...)>; + using __alloc_traits = allocator_traits<_Alloc>; + using _FunAlloc = typename __rebind_alloc_helper<__alloc_traits, _Fun>::type; if (__function::__not_null(__f)) { @@ -778,7 +778,7 @@ class __policy_func<_Rp(_ArgTypes...)> } else { - typedef __allocator_destructor<_FunAlloc> _Dp; + using _Dp = __allocator_destructor<_FunAlloc>; unique_ptr<_Fun, _Dp> __hold(__af.allocate(1), _Dp(__af, 1)); ::new ((void*) __hold.get()) _Fun(_CUDA_VSTD::move(__f), _Alloc(__af)); __buf_.__large = __hold.release(); @@ -790,7 +790,7 @@ class __policy_func<_Rp(_ArgTypes...)> _LIBCUDACXX_HIDE_FROM_ABI explicit __policy_func(_Fp&& __f) : __policy_(__policy::__create_empty()) { - typedef __default_alloc_func<_Fp, _Rp(_ArgTypes...)> _Fun; + using _Fun = __default_alloc_func<_Fp, _Rp(_ArgTypes...)>; if (__function::__not_null(__f)) { @@ -913,7 +913,7 @@ extern "C" void _Block_release(const void*); template class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> { - typedef _Rp1 (^__block_type)(_ArgTypes1...); + using ...); = _Rp1 (^__block_type)(_ArgTypes1 __block_type __f_; public: @@ -989,7 +989,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT function<_Rp(_ArgTypes...)> : public __function::__maybe_derive_from_unary_function<_Rp(_ArgTypes...)> , public __function::__maybe_derive_from_binary_function<_Rp(_ArgTypes...)> { - typedef __function::__policy_func<_Rp(_ArgTypes...)> __func; + using __func = __function::__policy_func<_Rp(_ArgTypes...)>; __func __f_; @@ -1011,7 +1011,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT function<_Rp(_ArgTypes...)> using _EnableIfLValueCallable = enable_if_t<__callable<_Fp&>::value>; public: - typedef _Rp result_type; + using result_type = _Rp; // construct/copy/destroy: _LIBCUDACXX_HIDE_FROM_ABI function() noexcept {} diff --git a/libcudacxx/include/cuda/std/__functional/hash.h b/libcudacxx/include/cuda/std/__functional/hash.h index 5e3559663c0..d40f393e9b1 100644 --- a/libcudacxx/include/cuda/std/__functional/hash.h +++ b/libcudacxx/include/cuda/std/__functional/hash.h @@ -35,7 +35,7 @@ #include #include #include -#include +#include #ifndef __cuda_std__ @@ -45,7 +45,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI _Size __loadword(const void* __p) { _Size __r; - std::memcpy(&__r, __p, sizeof(__r)); + _CUDA_VSTD::memcpy(&__r, __p, sizeof(__r)); return __r; } @@ -374,7 +374,7 @@ struct _PairT _LIBCUDACXX_HIDE_FROM_ABI size_t __hash_combine(size_t __lhs, size_t __rhs) noexcept { - typedef __scalar_hash<_PairT> _HashT; + using _HashT = __scalar_hash<_PairT>; const _PairT __p = {__lhs, __rhs}; return _HashT()(__p); } @@ -618,7 +618,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __enum_hash : public __unary_function<_Tp, { _LIBCUDACXX_HIDE_FROM_ABI size_t operator()(_Tp __v) const noexcept { - typedef typename underlying_type<_Tp>::type type; + using type = typename underlying_type<_Tp>::type; return hash()(static_cast(__v)); } }; diff --git a/libcudacxx/include/cuda/std/__functional/invoke.h b/libcudacxx/include/cuda/std/__functional/invoke.h index f3072249fb8..e60e3b0b363 100644 --- a/libcudacxx/include/cuda/std/__functional/invoke.h +++ 
b/libcudacxx/include/cuda/std/__functional/invoke.h @@ -56,200 +56,200 @@ struct __member_pointer_traits_imp template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...), true, false> { - typedef _Class _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...), true, false> { - typedef _Class _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const, true, false> { - typedef _Class const _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const, true, false> { - typedef _Class const _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) volatile, true, false> { - typedef _Class volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class volatile; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) volatile, true, false> { - typedef _Class volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class volatile; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const volatile, true, false> { - typedef _Class const volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const volatile; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const volatile, true, false> { - typedef _Class const volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const volatile; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...)&, true, false> { - typedef _Class& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...)&, true, false> { - typedef _Class& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const&, true, false> { - typedef _Class const& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) 
const&, true, false> { - typedef _Class const& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) volatile&, true, false> { - typedef _Class volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class volatile&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) volatile&, true, false> { - typedef _Class volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class volatile&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const volatile&, true, false> { - typedef _Class const volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const volatile&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const volatile&, true, false> { - typedef _Class const volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const volatile&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...)&&, true, false> { - typedef _Class&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...)&&, true, false> { - typedef _Class&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const&&, true, false> { - typedef _Class const&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const&&, true, false> { - typedef _Class const&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) volatile&&, true, false> { - typedef _Class volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class volatile&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) volatile&&, true, false> { - typedef _Class volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class volatile&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) 
const volatile&&, true, false> { - typedef _Class const volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); + using _ClassType = _Class const volatile&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param...); }; template struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const volatile&&, true, false> { - typedef _Class const volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); + using _ClassType = _Class const volatile&&; + using _ReturnType = _Rp; + using _FnType = _Rp (*)(_Param..., ...); }; template struct __member_pointer_traits_imp<_Rp _Class::*, false, true> { - typedef _Class _ClassType; - typedef _Rp _ReturnType; + using _ClassType = _Class; + using _ReturnType = _Rp; }; template @@ -270,7 +270,7 @@ struct __member_pointer_class_type template struct __member_pointer_class_type<_Ret _ClassType::*> { - typedef _ClassType type; + using type = _ClassType; }; template struct __nothrow_invokable_r_imp { - typedef __nothrow_invokable_r_imp _ThisT; + using _ThisT = __nothrow_invokable_r_imp; template _LIBCUDACXX_HIDE_FROM_ABI static void __test_noexcept(_Tp) noexcept; diff --git a/libcudacxx/include/cuda/std/__functional/mem_fn.h b/libcudacxx/include/cuda/std/__functional/mem_fn.h index 8327b4edfef..ffbf2c90822 100644 --- a/libcudacxx/include/cuda/std/__functional/mem_fn.h +++ b/libcudacxx/include/cuda/std/__functional/mem_fn.h @@ -33,7 +33,7 @@ class __mem_fn : public __weak_result_type<_Tp> { public: // types - typedef _Tp type; + using type = _Tp; private: type __f_; diff --git a/libcudacxx/include/cuda/std/__functional/operations.h b/libcudacxx/include/cuda/std/__functional/operations.h index a52a0af2840..24ced46b12f 100644 --- a/libcudacxx/include/cuda/std/__functional/operations.h +++ b/libcudacxx/include/cuda/std/__functional/operations.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct _CCCL_TYPE_VISIBILITY_DEFAULT plus : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -52,13 +52,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT plus { return _CUDA_VSTD::forward<_T1>(__t) + _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT minus : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -78,13 +78,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT minus { return _CUDA_VSTD::forward<_T1>(__t) - _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT multiplies : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -104,13 +104,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT multiplies { return _CUDA_VSTD::forward<_T1>(__t) * _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT divides : 
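Most of the functional-header churn above is a mechanical typedef-to-using rewrite; the two spellings declare exactly the same alias, and the alias-declaration form stays readable for function-pointer and template aliases. A minimal equivalence check with hypothetical names (example_traits, old_call_t, new_call_t):

    #include <type_traits>

    template <class Ret, class... Args>
    struct example_traits
    {
      typedef Ret (*old_call_t)(Args...);  // old spelling
      using new_call_t = Ret (*)(Args...); // new spelling, same type
      static_assert(std::is_same<old_call_t, new_call_t>::value, "purely syntactic change");
    };

    template struct example_traits<void, int, float>; // instantiate to run the check
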
__binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -130,13 +130,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT divides { return _CUDA_VSTD::forward<_T1>(__t) / _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT modulus : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -156,13 +156,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT modulus { return _CUDA_VSTD::forward<_T1>(__t) % _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT negate : __unary_function<_Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _Tp operator()(const _Tp& __x) const { @@ -181,7 +181,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT negate { return -_CUDA_VSTD::forward<_Tp>(__x); } - typedef void is_transparent; + using is_transparent = void; }; // Bitwise operations @@ -189,7 +189,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT negate template struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_and : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -209,7 +209,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_and { return _CUDA_VSTD::forward<_T1>(__t) & _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template @@ -233,13 +233,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_not { return ~_CUDA_VSTD::forward<_Tp>(__x); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_or : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_CONSTEXPR_CXX14 _LIBCUDACXX_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -259,13 +259,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_or { return _CUDA_VSTD::forward<_T1>(__t) | _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_xor : __binary_function<_Tp, _Tp, _Tp> { - typedef _Tp __result_type; // used by valarray + using __result_type = _Tp; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp operator()(const _Tp& __x, const _Tp& __y) const { @@ -285,7 +285,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_xor { return _CUDA_VSTD::forward<_T1>(__t) ^ _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; // Comparison operations @@ -293,7 +293,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bit_xor template struct _CCCL_TYPE_VISIBILITY_DEFAULT equal_to : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using 
__result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -313,13 +313,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT equal_to { return _CUDA_VSTD::forward<_T1>(__t) == _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT not_equal_to : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -339,13 +339,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT not_equal_to { return _CUDA_VSTD::forward<_T1>(__t) != _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT less : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -365,13 +365,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT less { return _CUDA_VSTD::forward<_T1>(__t) < _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT less_equal : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -391,13 +391,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT less_equal { return _CUDA_VSTD::forward<_T1>(__t) <= _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT greater_equal : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -417,13 +417,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT greater_equal { return _CUDA_VSTD::forward<_T1>(__t) >= _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT greater : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -443,7 +443,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT greater { return _CUDA_VSTD::forward<_T1>(__t) > _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; // Logical operations @@ -451,7 +451,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT greater template struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_and : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -471,13 +471,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_and { 
return _CUDA_VSTD::forward<_T1>(__t) && _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_not : __unary_function<_Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x) const { @@ -496,13 +496,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_not { return !_CUDA_VSTD::forward<_Tp>(__x); } - typedef void is_transparent; + using is_transparent = void; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_or : __binary_function<_Tp, _Tp, bool> { - typedef bool __result_type; // used by valarray + using __result_type = bool; // used by valarray _CCCL_EXEC_CHECK_DISABLE _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator()(const _Tp& __x, const _Tp& __y) const { @@ -522,7 +522,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT logical_or { return _CUDA_VSTD::forward<_T1>(__t) || _CUDA_VSTD::forward<_T2>(__u); } - typedef void is_transparent; + using is_transparent = void; }; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__functional/reference_wrapper.h b/libcudacxx/include/cuda/std/__functional/reference_wrapper.h index dd8070871a9..7c4b4684079 100644 --- a/libcudacxx/include/cuda/std/__functional/reference_wrapper.h +++ b/libcudacxx/include/cuda/std/__functional/reference_wrapper.h @@ -35,7 +35,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT reference_wrapper : public __weak_result_typ { public: // types - typedef _Tp type; + using type = _Tp; private: type* __f_; diff --git a/libcudacxx/include/cuda/std/__functional/unary_function.h b/libcudacxx/include/cuda/std/__functional/unary_function.h index c3509753574..915bd68652b 100644 --- a/libcudacxx/include/cuda/std/__functional/unary_function.h +++ b/libcudacxx/include/cuda/std/__functional/unary_function.h @@ -26,8 +26,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct _CCCL_TYPE_VISIBILITY_DEFAULT _LIBCUDACXX_DEPRECATED_IN_CXX11 unary_function { - typedef _Arg argument_type; - typedef _Result result_type; + using argument_type = _Arg; + using result_type = _Result; }; #endif // _CCCL_STD_VER <= 2014 diff --git a/libcudacxx/include/cuda/std/__functional/unwrap_ref.h b/libcudacxx/include/cuda/std/__functional/unwrap_ref.h index 81868eafd5b..ca99e370ddf 100644 --- a/libcudacxx/include/cuda/std/__functional/unwrap_ref.h +++ b/libcudacxx/include/cuda/std/__functional/unwrap_ref.h @@ -25,7 +25,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct __unwrap_reference { - typedef _CCCL_NODEBUG_ALIAS _Tp type; + using type _CCCL_NODEBUG_ALIAS = _Tp; }; template @@ -34,7 +34,7 @@ class reference_wrapper; template struct __unwrap_reference> { - typedef _CCCL_NODEBUG_ALIAS _Tp& type; + using type _CCCL_NODEBUG_ALIAS = _Tp&; }; template diff --git a/libcudacxx/include/cuda/std/__functional/weak_result_type.h b/libcudacxx/include/cuda/std/__functional/weak_result_type.h index 1aff29113a3..dae84e02025 100644 --- a/libcudacxx/include/cuda/std/__functional/weak_result_type.h +++ b/libcudacxx/include/cuda/std/__functional/weak_result_type.h @@ -59,7 +59,7 @@ struct __derives_from_unary_function public: static const bool value = !is_same::value; - typedef decltype(__test((_Tp*) 0)) type; + using type = decltype(__test((_Tp*) 0)); }; template @@ -78,7 +78,7 @@ struct __derives_from_binary_function public: static const bool value = !is_same::value; - typedef 
decltype(__test((_Tp*) 0)) type; + using type = decltype(__test((_Tp*) 0)); }; template ::value> @@ -266,7 +266,7 @@ struct __weak_result_type<_Rp (_Cp::*)(_A1, _A2, _A3...) const volatile> template struct __invoke_return { - typedef decltype(_CUDA_VSTD::__invoke(declval<_Tp>(), declval<_Args>()...)) type; + using type = decltype(_CUDA_VSTD::__invoke(declval<_Tp>(), declval<_Args>()...)); }; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__fwd/string_view.h b/libcudacxx/include/cuda/std/__fwd/string_view.h index 32fd502f818..0b0d9b51858 100644 --- a/libcudacxx/include/cuda/std/__fwd/string_view.h +++ b/libcudacxx/include/cuda/std/__fwd/string_view.h @@ -28,14 +28,14 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template > class _CCCL_TYPE_VISIBILITY_DEFAULT basic_string_view; -typedef basic_string_view string_view; +using string_view = basic_string_view; #ifndef _LIBCUDACXX_HAS_NO_CHAR8_T -typedef basic_string_view u8string_view; +using u8string_view = basic_string_view; #endif -typedef basic_string_view u16string_view; -typedef basic_string_view u32string_view; +using u16string_view = basic_string_view; +using u32string_view = basic_string_view; #ifndef _LIBCUDACXX_HAS_NO_WIDE_CHARACTERS -typedef basic_string_view wstring_view; +using wstring_view = basic_string_view; #endif // clang-format off diff --git a/libcudacxx/include/cuda/std/__internal/cpp_dialect.h b/libcudacxx/include/cuda/std/__internal/cpp_dialect.h index a4ea71d7ef5..2fec82a7fac 100644 --- a/libcudacxx/include/cuda/std/__internal/cpp_dialect.h +++ b/libcudacxx/include/cuda/std/__internal/cpp_dialect.h @@ -35,12 +35,10 @@ future release. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) // clang-format on -#if _CCCL_STD_VER < 2011 -# error libcu++ requires C++11 or later. -#elif _CCCL_STD_VER == 2011 && !defined(CCCL_IGNORE_DEPRECATED_CPP_11) -LIBCUDACXX_DIALECT_DEPRECATION(C++ 17, C++ 11) -#elif _CCCL_STD_VER == 2014 && !defined(CCCL_IGNORE_DEPRECATED_CPP_14) -LIBCUDACXX_DIALECT_DEPRECATION(C++ 17, C++ 14) -#endif // _CCCL_STD_VER >= 2017 +#ifndef CCCL_IGNORE_DEPRECATED_CPP_DIALECT +# if _CCCL_STD_VER < 2017 +# error libcu++ requires at least C++ 17. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message. 
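The dialect check rewritten above turns pre-C++17 builds into a hard error instead of a deprecation warning; the only opt-out is the macro named in the message. A small illustration of that opt-out (the macro name and error text come from the hunk; the include is just an example, and suppressing the check is not recommended):

    // Compiling this translation unit with, e.g., -std=c++14 would otherwise stop with:
    //   error: libcu++ requires at least C++ 17. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT
    //   to suppress this message.
    #define CCCL_IGNORE_DEPRECATED_CPP_DIALECT 1
    #include <cuda/std/version> // any libcu++ header runs the dialect check
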
+# endif // _CCCL_STD_VER >= 2017 +#endif // CCCL_IGNORE_DEPRECATED_CPP_DIALECT #endif // _LIBCUDACXX___INTERNAL_CPP_DIALECT_H diff --git a/libcudacxx/include/cuda/std/__iterator/advance.h b/libcudacxx/include/cuda/std/__iterator/advance.h index 17ba093634a..48359585167 100644 --- a/libcudacxx/include/cuda/std/__iterator/advance.h +++ b/libcudacxx/include/cuda/std/__iterator/advance.h @@ -74,8 +74,8 @@ template ::value>> _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void advance(_InputIter& __i, _Distance __orig_n) { - typedef typename iterator_traits<_InputIter>::difference_type _Difference; - _Difference __n = static_cast<_Difference>(_CUDA_VSTD::__convert_to_integral(__orig_n)); + using _Difference = typename iterator_traits<_InputIter>::difference_type; + _Difference __n = static_cast<_Difference>(_CUDA_VSTD::__convert_to_integral(__orig_n)); _CCCL_ASSERT(__n >= 0 || __is_cpp17_bidirectional_iterator<_InputIter>::value, "Attempt to advance(it, n) with negative n on a non-bidirectional iterator"); _CUDA_VSTD::__advance(__i, __n, typename iterator_traits<_InputIter>::iterator_category()); diff --git a/libcudacxx/include/cuda/std/__iterator/back_insert_iterator.h b/libcudacxx/include/cuda/std/__iterator/back_insert_iterator.h index dbb8e3f8028..c551e0c7364 100644 --- a/libcudacxx/include/cuda/std/__iterator/back_insert_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/back_insert_iterator.h @@ -42,16 +42,16 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT back_insert_iterator _Container* container; public: - typedef output_iterator_tag iterator_category; - typedef void value_type; + using iterator_category = output_iterator_tag; + using value_type = void; #if _CCCL_STD_VER > 2017 - typedef ptrdiff_t difference_type; + using difference_type = ptrdiff_t; #else - typedef void difference_type; + using difference_type = void; #endif - typedef void pointer; - typedef void reference; - typedef _Container container_type; + using pointer = void; + using reference = void; + using container_type = _Container; _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 explicit back_insert_iterator(_Container& __x) : container(_CUDA_VSTD::addressof(__x)) diff --git a/libcudacxx/include/cuda/std/__iterator/front_insert_iterator.h b/libcudacxx/include/cuda/std/__iterator/front_insert_iterator.h index 9918441ea09..c60a65e9db3 100644 --- a/libcudacxx/include/cuda/std/__iterator/front_insert_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/front_insert_iterator.h @@ -42,16 +42,16 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT front_insert_iterator _Container* container; public: - typedef output_iterator_tag iterator_category; - typedef void value_type; + using iterator_category = output_iterator_tag; + using value_type = void; #if _CCCL_STD_VER > 2017 - typedef ptrdiff_t difference_type; + using difference_type = ptrdiff_t; #else - typedef void difference_type; + using difference_type = void; #endif - typedef void pointer; - typedef void reference; - typedef _Container container_type; + using pointer = void; + using reference = void; + using container_type = _Container; _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 explicit front_insert_iterator(_Container& __x) : container(_CUDA_VSTD::addressof(__x)) diff --git a/libcudacxx/include/cuda/std/__iterator/insert_iterator.h b/libcudacxx/include/cuda/std/__iterator/insert_iterator.h index 5d63ba91847..227c4983d5b 100644 --- a/libcudacxx/include/cuda/std/__iterator/insert_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/insert_iterator.h @@ -46,16 +46,16 @@ class 
_CCCL_TYPE_VISIBILITY_DEFAULT insert_iterator __insert_iterator_iter_t<_Container> iter; public: - typedef output_iterator_tag iterator_category; - typedef void value_type; + using iterator_category = output_iterator_tag; + using value_type = void; #if _CCCL_STD_VER > 2017 - typedef ptrdiff_t difference_type; + using difference_type = ptrdiff_t; #else - typedef void difference_type; + using difference_type = void; #endif - typedef void pointer; - typedef void reference; - typedef _Container container_type; + using pointer = void; + using reference = void; + using container_type = _Container; _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 insert_iterator(_Container& __x, __insert_iterator_iter_t<_Container> __i) diff --git a/libcudacxx/include/cuda/std/__iterator/istream_iterator.h b/libcudacxx/include/cuda/std/__iterator/istream_iterator.h index 687ebe69868..95cb22f734e 100644 --- a/libcudacxx/include/cuda/std/__iterator/istream_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/istream_iterator.h @@ -40,14 +40,14 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT istream_iterator _CCCL_SUPPRESS_DEPRECATED_POP public: - typedef input_iterator_tag iterator_category; - typedef _Tp value_type; - typedef _Distance difference_type; - typedef const _Tp* pointer; - typedef const _Tp& reference; - typedef _CharT char_type; - typedef _Traits traits_type; - typedef basic_istream<_CharT, _Traits> istream_type; + using iterator_category = input_iterator_tag; + using value_type = _Tp; + using difference_type = _Distance; + using pointer = const _Tp*; + using reference = const _Tp&; + using char_type = _CharT; + using traits_type = _Traits; + using istream_type = basic_istream<_CharT, _Traits>; private: istream_type* __in_stream_; diff --git a/libcudacxx/include/cuda/std/__iterator/istreambuf_iterator.h b/libcudacxx/include/cuda/std/__iterator/istreambuf_iterator.h index b716ea77c08..c44b79acb43 100644 --- a/libcudacxx/include/cuda/std/__iterator/istreambuf_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/istreambuf_iterator.h @@ -38,16 +38,16 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT istreambuf_iterator _CCCL_SUPPRESS_DEPRECATED_POP public: - typedef input_iterator_tag iterator_category; - typedef _CharT value_type; - typedef typename _Traits::off_type difference_type; - typedef _CharT* pointer; - typedef _CharT reference; - typedef _CharT char_type; - typedef _Traits traits_type; - typedef typename _Traits::int_type int_type; - typedef basic_streambuf<_CharT, _Traits> streambuf_type; - typedef basic_istream<_CharT, _Traits> istream_type; + using iterator_category = input_iterator_tag; + using value_type = _CharT; + using difference_type = typename _Traits::off_type; + using pointer = _CharT*; + using reference = _CharT; + using char_type = _CharT; + using traits_type = _Traits; + using int_type = typename _Traits::int_type; + using streambuf_type = basic_streambuf<_CharT, _Traits>; + using istream_type = basic_istream<_CharT, _Traits>; private: mutable streambuf_type* __sbuf_; diff --git a/libcudacxx/include/cuda/std/__iterator/iterator.h b/libcudacxx/include/cuda/std/__iterator/iterator.h index a85bbd4ef64..8d3e722507c 100644 --- a/libcudacxx/include/cuda/std/__iterator/iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/iterator.h @@ -28,11 +28,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct _CCCL_TYPE_VISIBILITY_DEFAULT _LIBCUDACXX_DEPRECATED_IN_CXX17 iterator { - typedef _Tp value_type; - typedef _Distance difference_type; - typedef _Pointer pointer; - typedef _Reference reference; - 
typedef _Category iterator_category; + using value_type = _Tp; + using difference_type = _Distance; + using pointer = _Pointer; + using reference = _Reference; + using iterator_category = _Category; }; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h index 27f9262e070..da153007b56 100644 --- a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h +++ b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h @@ -823,11 +823,11 @@ struct __iterator_traits_impl template struct __iterator_traits_impl<_Iter, true> { - typedef typename _Iter::difference_type difference_type; - typedef typename _Iter::value_type value_type; - typedef typename _Iter::pointer pointer; - typedef typename _Iter::reference reference; - typedef typename _Iter::iterator_category iterator_category; + using difference_type = typename _Iter::difference_type; + using value_type = typename _Iter::value_type; + using pointer = typename _Iter::pointer; + using reference = typename _Iter::reference; + using iterator_category = typename _Iter::iterator_category; }; template @@ -855,13 +855,13 @@ template #endif struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits<_Tp*> { - typedef ptrdiff_t difference_type; - typedef remove_cv_t<_Tp> value_type; - typedef _Tp* pointer; - typedef typename add_lvalue_reference<_Tp>::type reference; - typedef random_access_iterator_tag iterator_category; + using difference_type = ptrdiff_t; + using value_type = remove_cv_t<_Tp>; + using pointer = _Tp*; + using reference = typename add_lvalue_reference<_Tp>::type; + using iterator_category = random_access_iterator_tag; #if _CCCL_STD_VER >= 2014 - typedef contiguous_iterator_tag iterator_concept; + using iterator_concept = contiguous_iterator_tag; #endif }; diff --git a/libcudacxx/include/cuda/std/__iterator/move_iterator.h b/libcudacxx/include/cuda/std/__iterator/move_iterator.h index 0436b25b36c..7d7c5b3a600 100644 --- a/libcudacxx/include/cuda/std/__iterator/move_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/move_iterator.h @@ -151,16 +151,16 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator using pointer = _Iter; using reference = iter_rvalue_reference_t<_Iter>; #else // ^^^ _CCCL_STD_VER > 2014 ^^^ / vvv _CCCL_STD_VER < 2017 vvv - typedef _Iter iterator_type; - typedef _If<__is_cpp17_random_access_iterator<_Iter>::value, - random_access_iterator_tag, - typename iterator_traits<_Iter>::iterator_category> - iterator_category; - typedef typename iterator_traits::value_type value_type; - typedef typename iterator_traits::difference_type difference_type; - typedef iterator_type pointer; - typedef typename iterator_traits::reference __reference; - typedef conditional_t::value, remove_reference_t<__reference>&&, __reference> reference; + using iterator_type = _Iter; + using iterator_category = + _If<__is_cpp17_random_access_iterator<_Iter>::value, + random_access_iterator_tag, + typename iterator_traits<_Iter>::iterator_category>; + using value_type = typename iterator_traits::value_type; + using difference_type = typename iterator_traits::difference_type; + using pointer = iterator_type; + using __reference = typename iterator_traits::reference; + using reference = conditional_t::value, remove_reference_t<__reference>&&, __reference>; #endif // _CCCL_STD_VER < 2017 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 explicit move_iterator(_Iter __i) diff --git a/libcudacxx/include/cuda/std/__iterator/ostream_iterator.h 
b/libcudacxx/include/cuda/std/__iterator/ostream_iterator.h index 19d70cbd183..e04f168a3ea 100644 --- a/libcudacxx/include/cuda/std/__iterator/ostream_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/ostream_iterator.h @@ -39,18 +39,18 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT ostream_iterator _CCCL_SUPPRESS_DEPRECATED_POP public: - typedef output_iterator_tag iterator_category; - typedef void value_type; + using iterator_category = output_iterator_tag; + using value_type = void; #if _CCCL_STD_VER > 2017 - typedef ptrdiff_t difference_type; + using difference_type = ptrdiff_t; #else - typedef void difference_type; + using difference_type = void; #endif - typedef void pointer; - typedef void reference; - typedef _CharT char_type; - typedef _Traits traits_type; - typedef basic_ostream<_CharT, _Traits> ostream_type; + using pointer = void; + using reference = void; + using char_type = _CharT; + using traits_type = _Traits; + using ostream_type = basic_ostream<_CharT, _Traits>; private: ostream_type* __out_stream_; diff --git a/libcudacxx/include/cuda/std/__iterator/ostreambuf_iterator.h b/libcudacxx/include/cuda/std/__iterator/ostreambuf_iterator.h index f7a7ae1966d..b62226cb7f0 100644 --- a/libcudacxx/include/cuda/std/__iterator/ostreambuf_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/ostreambuf_iterator.h @@ -38,19 +38,19 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT ostreambuf_iterator _CCCL_SUPPRESS_DEPRECATED_POP public: - typedef output_iterator_tag iterator_category; - typedef void value_type; + using iterator_category = output_iterator_tag; + using value_type = void; #if _CCCL_STD_VER > 2017 - typedef ptrdiff_t difference_type; + using difference_type = ptrdiff_t; #else - typedef void difference_type; + using difference_type = void; #endif - typedef void pointer; - typedef void reference; - typedef _CharT char_type; - typedef _Traits traits_type; - typedef basic_streambuf<_CharT, _Traits> streambuf_type; - typedef basic_ostream<_CharT, _Traits> ostream_type; + using pointer = void; + using reference = void; + using char_type = _CharT; + using traits_type = _Traits; + using streambuf_type = basic_streambuf<_CharT, _Traits>; + using ostream_type = basic_ostream<_CharT, _Traits>; private: streambuf_type* __sbuf_; diff --git a/libcudacxx/include/cuda/std/__iterator/wrap_iter.h b/libcudacxx/include/cuda/std/__iterator/wrap_iter.h index 0760192de83..97e6c47d13c 100644 --- a/libcudacxx/include/cuda/std/__iterator/wrap_iter.h +++ b/libcudacxx/include/cuda/std/__iterator/wrap_iter.h @@ -34,14 +34,14 @@ template class __wrap_iter { public: - typedef _Iter iterator_type; - typedef typename iterator_traits::value_type value_type; - typedef typename iterator_traits::difference_type difference_type; - typedef typename iterator_traits::pointer pointer; - typedef typename iterator_traits::reference reference; - typedef typename iterator_traits::iterator_category iterator_category; + using iterator_type = _Iter; + using value_type = typename iterator_traits::value_type; + using difference_type = typename iterator_traits::difference_type; + using pointer = typename iterator_traits::pointer; + using reference = typename iterator_traits::reference; + using iterator_category = typename iterator_traits::iterator_category; #if _CCCL_STD_VER > 2011 - typedef contiguous_iterator_tag iterator_concept; + using iterator_concept = contiguous_iterator_tag; #endif private: @@ -241,9 +241,9 @@ struct __is_cpp17_contiguous_iterator<__wrap_iter<_It>> : true_type template struct _CCCL_TYPE_VISIBILITY_DEFAULT 
pointer_traits<__wrap_iter<_It>> { - typedef __wrap_iter<_It> pointer; - typedef typename pointer_traits<_It>::element_type element_type; - typedef typename pointer_traits<_It>::difference_type difference_type; + using pointer = __wrap_iter<_It>; + using element_type = typename pointer_traits<_It>::element_type; + using difference_type = typename pointer_traits<_It>::difference_type; _LIBCUDACXX_HIDE_FROM_ABI constexpr static element_type* to_address(pointer __w) noexcept { diff --git a/libcudacxx/include/cuda/std/__linalg/conj_if_needed.h b/libcudacxx/include/cuda/std/__linalg/conj_if_needed.h new file mode 100644 index 00000000000..c5ddcedcedb --- /dev/null +++ b/libcudacxx/include/cuda/std/__linalg/conj_if_needed.h @@ -0,0 +1,79 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +// ************************************************************************ +//@HEADER + +#ifndef _LIBCUDACXX___LINALG_CONJUGATE_IF_NEEDED_HPP +#define _LIBCUDACXX___LINALG_CONJUGATE_IF_NEEDED_HPP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 + +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +namespace linalg +{ + +_LIBCUDACXX_BEGIN_NAMESPACE_CPO(__conj_if_needed) + +template +_CCCL_CONCEPT _HasConj = _CCCL_REQUIRES_EXPR((_Type), _Type __a)(static_cast(_CUDA_VSTD::conj(__a))); + +struct __conj_if_needed +{ + template + _LIBCUDACXX_HIDE_FROM_ABI constexpr auto operator()(const _Type& __t) const + { + if constexpr (is_arithmetic_v<_Type> || !_HasConj<_Type>) + { + return __t; + } + else + { + return _CUDA_VSTD::conj(__t); + } + _CCCL_UNREACHABLE(); + } +}; + +_LIBCUDACXX_END_NAMESPACE_CPO + +inline namespace __cpo +{ +_CCCL_GLOBAL_CONSTANT auto conj_if_needed = __conj_if_needed::__conj_if_needed{}; + +} // namespace __cpo +} // end namespace linalg + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 +#endif // _LIBCUDACXX___LINALG_CONJUGATED_HPP diff --git a/libcudacxx/include/cuda/std/__linalg/conjugate_transposed.h b/libcudacxx/include/cuda/std/__linalg/conjugate_transposed.h new file mode 100644 index 00000000000..ab984c78152 --- /dev/null +++ b/libcudacxx/include/cuda/std/__linalg/conjugate_transposed.h @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +// ************************************************************************ +//@HEADER + +#ifndef _LIBCUDACXX___LINALG_CONJUGATE_TRANSPOSED_HPP +#define _LIBCUDACXX___LINALG_CONJUGATE_TRANSPOSED_HPP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 + +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +namespace linalg +{ + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +conjugate_transposed(mdspan<_ElementType, _Extents, _Layout, _Accessor> __a) +{ + return conjugated(transposed(__a)); +} + +} // end namespace linalg + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 +#endif // _LIBCUDACXX___LINALG_CONJUGATE_TRANSPOSED_HPP diff --git a/libcudacxx/include/cuda/std/__linalg/conjugated.h b/libcudacxx/include/cuda/std/__linalg/conjugated.h new file mode 100644 index 00000000000..8604ccdc1a7 --- /dev/null +++ b/libcudacxx/include/cuda/std/__linalg/conjugated.h @@ -0,0 +1,142 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +// ************************************************************************ +//@HEADER + +#ifndef _LIBCUDACXX___LINALG_CONJUGATED_HPP +#define _LIBCUDACXX___LINALG_CONJUGATED_HPP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +namespace linalg +{ + +template +class conjugated_accessor +{ +private: + using __nested_element_type = typename _NestedAccessor::element_type; + using __nc_result_type = decltype(conj_if_needed(_CUDA_VSTD::declval<__nested_element_type>())); + +public: + using element_type = add_const_t<__nc_result_type>; + using reference = remove_const_t; + using data_handle_type = typename _NestedAccessor::data_handle_type; + using offset_policy = conjugated_accessor; + + _CCCL_HIDE_FROM_ABI constexpr conjugated_accessor() = default; + + _LIBCUDACXX_HIDE_FROM_ABI constexpr conjugated_accessor(const _NestedAccessor& __acc) + : __nested_accessor_(__acc) + {} + + _CCCL_TEMPLATE(class _OtherNestedAccessor) + _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _NestedAccessor, const _OtherNestedAccessor&) + _CCCL_AND _CCCL_TRAIT(is_convertible, _OtherNestedAccessor, _NestedAccessor)) + _LIBCUDACXX_HIDE_FROM_ABI constexpr conjugated_accessor(const conjugated_accessor<_OtherNestedAccessor>& __other) + : __nested_accessor_(__other.nested_accessor()) + {} + + _CCCL_TEMPLATE(class _OtherNestedAccessor) + _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _NestedAccessor, const _OtherNestedAccessor&) + _CCCL_AND(!_CCCL_TRAIT(is_convertible, _OtherNestedAccessor, _NestedAccessor))) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr conjugated_accessor( + const conjugated_accessor<_OtherNestedAccessor>& __other) + : __nested_accessor_(__other.nested_accessor()) + {} + + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference access(data_handle_type __p, size_t __i) const noexcept + { + return conj_if_needed(__nested_element_type(__nested_accessor_.access(__p, __i))); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr typename offset_policy::data_handle_type + offset(data_handle_type __p, size_t __i) const noexcept + { + return __nested_accessor_.offset(__p, __i); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr const _NestedAccessor& nested_accessor() const noexcept + { + return __nested_accessor_; + } + +private: + _NestedAccessor __nested_accessor_; +}; + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +conjugated(mdspan<_ElementType, _Extents, _Layout, _Accessor> __a) +{ + using __value_type = typename decltype(__a)::value_type; + // Current status of [linalg] only optimizes if _Accessor is conjugated_accessor<_Accessor> for some _Accessor. + // There's a separate specialization for that case below. + + // P3050 optimizes conjugated's accessor type for when we know that it can't be complex: arithmetic types, + // and types for which `conj` is not ADL-findable. 
+ if constexpr (is_arithmetic_v<__value_type> || !__conj_if_needed::_HasConj<__value_type>) + { + return mdspan<_ElementType, _Extents, _Layout, _Accessor>(__a.data_handle(), __a.mapping(), __a.accessor()); + } + else + { + using __return_element_type = typename conjugated_accessor<_Accessor>::element_type; + using __return_accessor_type = conjugated_accessor<_Accessor>; + return mdspan<__return_element_type, _Extents, _Layout, __return_accessor_type>{ + __a.data_handle(), __a.mapping(), __return_accessor_type(__a.accessor())}; + } + _CCCL_UNREACHABLE(); +} + +// Conjugation is self-annihilating +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +conjugated(mdspan<_ElementType, _Extents, _Layout, conjugated_accessor<_NestedAccessor>> __a) +{ + using __return_element_type = typename _NestedAccessor::element_type; + using __return_accessor_type = _NestedAccessor; + return mdspan<__return_element_type, _Extents, _Layout, __return_accessor_type>( + __a.data_handle(), __a.mapping(), __a.accessor().nested_accessor()); +} + +} // end namespace linalg + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 +#endif // _LIBCUDACXX___LINALG_CONJUGATED_HPP diff --git a/libcudacxx/include/cuda/std/__linalg/scaled.h b/libcudacxx/include/cuda/std/__linalg/scaled.h new file mode 100644 index 00000000000..eabd7a6d520 --- /dev/null +++ b/libcudacxx/include/cuda/std/__linalg/scaled.h @@ -0,0 +1,135 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +// ************************************************************************ +//@HEADER + +#ifndef _LIBCUDACXX___LINALG_SCALED_HPP +#define _LIBCUDACXX___LINALG_SCALED_HPP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +namespace linalg +{ + +template +class scaled_accessor +{ +public: + using element_type = add_const_t< + decltype(_CUDA_VSTD::declval<_ScalingFactor>() * _CUDA_VSTD::declval())>; + using reference = remove_const_t; + using data_handle_type = typename _NestedAccessor::data_handle_type; + using offset_policy = scaled_accessor<_ScalingFactor, typename _NestedAccessor::offset_policy>; + + _CCCL_HIDE_FROM_ABI constexpr scaled_accessor() = default; + + _CCCL_TEMPLATE(class _OtherScalingFactor, class _OtherNestedAccessor) + _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _NestedAccessor, const _OtherNestedAccessor&) + _CCCL_AND _CCCL_TRAIT(is_constructible, _ScalingFactor, _OtherScalingFactor) + _CCCL_AND(!_CCCL_TRAIT(is_convertible, _OtherNestedAccessor, _NestedAccessor))) + _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr scaled_accessor( + const scaled_accessor<_OtherScalingFactor, _OtherNestedAccessor>& __other) + : __scaling_factor_(__other.scaling_factor()) + , __nested_accessor_(__other.nested_accessor()) + {} + + _CCCL_TEMPLATE(class _OtherScalingFactor, class _OtherNestedAccessor) + _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _NestedAccessor, const _OtherNestedAccessor&) + _CCCL_AND _CCCL_TRAIT(is_constructible, _ScalingFactor, _OtherScalingFactor) + _CCCL_AND _CCCL_TRAIT(is_convertible, _OtherNestedAccessor, _NestedAccessor)) + _LIBCUDACXX_HIDE_FROM_ABI constexpr scaled_accessor( + const scaled_accessor<_OtherScalingFactor, _OtherNestedAccessor>& __other) + : __scaling_factor_(__other.scaling_factor()) + , __nested_accessor_(__other.nested_accessor()) + {} + + _LIBCUDACXX_HIDE_FROM_ABI constexpr scaled_accessor(const _ScalingFactor& __s, const _NestedAccessor& __a) + : __scaling_factor_(__s) + , __nested_accessor_(__a) + {} + + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference access(data_handle_type __p, size_t __i) const + { + return __scaling_factor_ * typename _NestedAccessor::element_type(__nested_accessor_.access(__p, __i)); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI + typename offset_policy::data_handle_type constexpr offset(data_handle_type __p, size_t __i) const + { + return __nested_accessor_.offset(__p, __i); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _NestedAccessor nested_accessor() const noexcept + { + return __nested_accessor_; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _ScalingFactor scaling_factor() const noexcept + { + return __scaling_factor_; + } + +private: + _ScalingFactor __scaling_factor_; + _NestedAccessor __nested_accessor_; +}; + +namespace __detail +{ + +template +using __scaled_element_type = add_const_t::element_type>; + +} // namespace __detail + +template +_CCCL_NODISCARD +_LIBCUDACXX_HIDE_FROM_ABI constexpr mdspan<__detail::__scaled_element_type<_ScalingFactor, _Accessor>, + _Extents, + _Layout, + scaled_accessor<_ScalingFactor, _Accessor>> +scaled(_ScalingFactor __scaling_factor, mdspan<_ElementType, _Extents, _Layout, 
_Accessor> __x) +{ + using __acc_type = scaled_accessor<_ScalingFactor, _Accessor>; + return {__x.data_handle(), __x.mapping(), __acc_type{__scaling_factor, __x.accessor()}}; +} + +} // end namespace linalg + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 +#endif // _LIBCUDACXX___LINALG_SCALED_HPP diff --git a/libcudacxx/include/cuda/std/__linalg/transposed.h b/libcudacxx/include/cuda/std/__linalg/transposed.h new file mode 100644 index 00000000000..707cfa8bfe8 --- /dev/null +++ b/libcudacxx/include/cuda/std/__linalg/transposed.h @@ -0,0 +1,330 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +// ************************************************************************ +//@HEADER + +#ifndef _LIBCUDACXX___LINALG_TRANSPOSED_HPP +#define _LIBCUDACXX___LINALG_TRANSPOSED_HPP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 + +# include +# include +# include +# include +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +namespace linalg +{ + +namespace __detail +{ +// This struct helps us impose the rank constraint on the __type alias itself. 
+_CCCL_TEMPLATE(class _Extents) +_CCCL_REQUIRES((_Extents::rank() == 2)) +struct __transpose_extents_t_impl +{ + using __type = extents; +}; + +template +using __transpose_extents_t = typename __transpose_extents_t_impl<_Extents>::__type; + +_CCCL_TEMPLATE(class _Extents) +_CCCL_REQUIRES((_Extents::rank() == 2)) +_LIBCUDACXX_HIDE_FROM_ABI constexpr __transpose_extents_t<_Extents> __transpose_extents(const _Extents& __e) +{ + static_assert(is_same_v::index_type, typename _Extents::index_type>, + "Please fix __transpose_extents_t to account for P2553, which adds a template parameter SizeType to " + "extents."); + constexpr size_t __ext0 = _Extents::static_extent(0); + constexpr size_t __ext1 = _Extents::static_extent(1); + if constexpr (__ext0 == dynamic_extent) + { + if constexpr (__ext1 == dynamic_extent) + { + return __transpose_extents_t<_Extents>{__e.extent(1), __e.extent(0)}; + } + else + { + return __transpose_extents_t<_Extents>{/* __e.extent(1), */ __e.extent(0)}; + } + } + else + { + if constexpr (__ext1 == dynamic_extent) + { + return __transpose_extents_t<_Extents>{__e.extent(1) /* , __e.extent(0) */}; + } + else + { + return __transpose_extents_t<_Extents>{}; // all extents are static + } + } + _CCCL_UNREACHABLE(); // GCC9 workaround +} + +} // namespace __detail + +template +class layout_transpose +{ +public: + using nested_layout_type = _Layout; + + template + struct mapping + { + private: + using __nested_mapping_type = typename _Layout::template mapping<__detail::__transpose_extents_t<_Extents>>; + + static constexpr bool __required_span_size_noexcept = noexcept(__nested_mapping_type{}.required_span_size()); + + static constexpr bool __is_nested_unique_noexcept = noexcept(__nested_mapping_type{}.is_unique()); + + static constexpr bool __is_exhaustive_noexcept = noexcept(__nested_mapping_type{}.is_exhaustive()); + + static constexpr bool __is_strided_noexcept = noexcept(__nested_mapping_type{}.is_strided()); + + public: + using extents_type = _Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_transpose; + + _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit mapping(const __nested_mapping_type& __map) + : __nested_mapping_(__map) + , __extents_(__detail::__transpose_extents(__map.extents())) + {} + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr const extents_type& extents() const noexcept + { + return __extents_; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr index_type required_span_size() const + noexcept(__required_span_size_noexcept) + { + return __nested_mapping_.required_span_size(); + } + + _CCCL_TEMPLATE(class _IndexType0, class _IndexType1) + _CCCL_REQUIRES(_CCCL_TRAIT(is_convertible, _IndexType0, index_type) + _CCCL_AND _CCCL_TRAIT(is_convertible, _IndexType1, index_type)) + _LIBCUDACXX_HIDE_FROM_ABI constexpr index_type operator()(_IndexType0 __i, _IndexType1 __j) const + { + return __nested_mapping_(__j, __i); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr const __nested_mapping_type& nested_mapping() const noexcept + { + return __nested_mapping_; + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool is_always_unique() noexcept + { + return __nested_mapping_type::is_always_unique(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool is_always_exhaustive() noexcept + { + return __nested_mapping_type::is_always_contiguous(); + } + + _CCCL_NODISCARD 
_LIBCUDACXX_HIDE_FROM_ABI static constexpr bool is_always_strided() noexcept + { + return __nested_mapping_type::is_always_strided(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool is_unique() const noexcept(__is_nested_unique_noexcept) + { + return __nested_mapping_.is_unique(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool is_exhaustive() const noexcept(__is_exhaustive_noexcept) + { + return __nested_mapping_.is_exhaustive(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool is_strided() const noexcept(__is_strided_noexcept) + { + return __nested_mapping_.is_strided(); + } + + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr index_type stride(size_t __r) const + { + _CCCL_ASSERT(this->is_strided(), "layout must be strided"); + _CCCL_ASSERT(__r < extents_type::rank(), "rank must be less than extents rank"); + return __nested_mapping_.stride(__r == 0 ? 1 : 0); + } + + template + _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool + operator==(const mapping& __lhs, const mapping<_OtherExtents>& __rhs) noexcept + { + return __lhs.__nested_mapping_ == __rhs.__nested_mapping_; + } + + template + _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool + operator!=(const mapping& __lhs, const mapping<_OtherExtents>& __rhs) noexcept + { + return __lhs.__nested_mapping_ != __rhs.__nested_mapping_; + } + + private: + __nested_mapping_type __nested_mapping_; + extents_type __extents_; + }; +}; + +namespace __detail +{ + +template +struct __transposed_element_accessor +{ + using __element_type = _ElementType; + using __accessor_type = _Accessor; + + _LIBCUDACXX_HIDE_FROM_ABI static constexpr __accessor_type __accessor(const _Accessor& __a) + { + return __accessor_type(__a); + } +}; + +template +struct __transposed_element_accessor<_ElementType, default_accessor<_ElementType>> +{ + using __element_type = _ElementType; + using __accessor_type = default_accessor<__element_type>; + + _LIBCUDACXX_HIDE_FROM_ABI static constexpr __accessor_type __accessor(const default_accessor<_ElementType>& __a) + { + return __accessor_type(__a); + } +}; + +template +struct __transposed_layout +{ + using __layout_type = layout_transpose<_Layout>; + + template + _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __mapping(const __OriginalMapping& __orig_map) + { + using __extents_type = __transpose_extents_t; + using __return_mapping_type = typename __layout_type::template __mapping<__extents_type>; + return __return_mapping_type{__orig_map}; + } +}; + +template <> +struct __transposed_layout +{ + using __layout_type = layout_right; + + template + _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto + __mapping(const typename layout_left::template mapping<_OriginalExtents>& __orig_map) + { + using __original_mapping_type = typename layout_left::template mapping<_OriginalExtents>; + using __extents_type = __transpose_extents_t; + using __return_mapping_type = typename __layout_type::template mapping<__extents_type>; + return __return_mapping_type{__transpose_extents(__orig_map.extents())}; + } +}; + +template <> +struct __transposed_layout +{ + using __layout_type = layout_left; + + template + _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto + __mapping(const typename layout_right::template mapping<_OriginalExtents>& __orig_map) + { + using __original_mapping_type = typename layout_right::template mapping<_OriginalExtents>; + using __extents_type = __transpose_extents_t; + using __return_mapping_type = typename __layout_type::template mapping<__extents_type>; + return 
__return_mapping_type{__transpose_extents(__orig_map.extents())}; + } +}; + +template <> +struct __transposed_layout +{ + using __layout_type = layout_stride; + + template + _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto + __mapping(const typename layout_stride::template mapping<_OriginalExtents>& __orig_map) + { + using __original_mapping_type = typename layout_stride::template mapping<_OriginalExtents>; + using __original_extents_type = typename __original_mapping_type::extents_type; + using __extents_type = __transpose_extents_t<__original_extents_type>; + using __return_mapping_type = typename __layout_type::template mapping<__extents_type>; + return __return_mapping_type{ + __transpose_extents(__orig_map.extents()), + array{ + __orig_map.stride(1), __orig_map.stride(0)}}; + } +}; + +// TODO add support for padded layouts + +template +struct __transposed_layout> +{ + using __layout_type = _NestedLayout; +}; + +} // namespace __detail + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto +transposed(mdspan<_ElementType, _Extents, _Layout, _Accessor> __a) +{ + using __element_type = typename __detail::__transposed_element_accessor<_ElementType, _Accessor>::__element_type; + using __layout_type = typename __detail::__transposed_layout<_Layout>::__layout_type; + using __accessor_type = typename __detail::__transposed_element_accessor<_ElementType, _Accessor>::__accessor_type; + auto __mapping = __detail::__transposed_layout<_Layout>::__mapping(__a.mapping()); + auto __accessor = __detail::__transposed_element_accessor<_ElementType, _Accessor>::__accessor(__a.accessor()); + return mdspan<__element_type, typename decltype(__mapping)::extents_type, __layout_type, __accessor_type>{ + __a.data_handle(), __mapping, __accessor}; +} + +} // end namespace linalg + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(__cccl_lib_mdspan) && _CCCL_STD_VER >= 2017 +#endif // _LIBCUDACXX___LINALG_TRANSPOSED_HPP diff --git a/libcudacxx/include/cuda/std/__memory/allocator.h b/libcudacxx/include/cuda/std/__memory/allocator.h index c771226e191..ae90ebc0d72 100644 --- a/libcudacxx/include/cuda/std/__memory/allocator.h +++ b/libcudacxx/include/cuda/std/__memory/allocator.h @@ -49,14 +49,14 @@ template <> class _CCCL_TYPE_VISIBILITY_DEFAULT allocator { public: - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef void* pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const void* const_pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef void value_type; + using pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = void*; + using const_pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = const void*; + using value_type _LIBCUDACXX_DEPRECATED_IN_CXX17 = void; template struct _LIBCUDACXX_DEPRECATED_IN_CXX17 rebind { - typedef allocator<_Up> other; + using other = allocator<_Up>; }; }; @@ -64,14 +64,14 @@ template <> class _CCCL_TYPE_VISIBILITY_DEFAULT allocator { public: - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const void* pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const void* const_pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const void value_type; + using pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = const void*; + using const_pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = const void*; + using value_type _LIBCUDACXX_DEPRECATED_IN_CXX17 = const void; template struct _LIBCUDACXX_DEPRECATED_IN_CXX17 rebind { - typedef allocator<_Up> other; + using other = allocator<_Up>; }; }; #endif // _CCCL_STD_VER <= 2017 @@ -109,11 +109,11 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator : private __non_trivial_if struct 
_LIBCUDACXX_DEPRECATED_IN_CXX17 rebind { - typedef allocator<_Up> other; + using other = allocator<_Up>; }; _LIBCUDACXX_DEPRECATED_IN_CXX17 _LIBCUDACXX_HIDE_FROM_ABI pointer address(reference __x) const noexcept @@ -213,11 +213,11 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator static_assert(!_CCCL_TRAIT(is_volatile, _Tp), "std::allocator does not support volatile types"); public: - typedef size_t size_type; - typedef ptrdiff_t difference_type; - typedef const _Tp value_type; - typedef true_type propagate_on_container_move_assignment; - typedef true_type is_always_equal; + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = const _Tp; + using propagate_on_container_move_assignment = true_type; + using is_always_equal = true_type; _CCCL_CONSTEXPR_CXX20 allocator() noexcept = default; @@ -262,15 +262,15 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator // C++20 Removed members #if _CCCL_STD_VER <= 2017 - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const _Tp* pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const _Tp* const_pointer; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const _Tp& reference; - _LIBCUDACXX_DEPRECATED_IN_CXX17 typedef const _Tp& const_reference; + using pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = const _Tp*; + using const_pointer _LIBCUDACXX_DEPRECATED_IN_CXX17 = const _Tp*; + using reference _LIBCUDACXX_DEPRECATED_IN_CXX17 = const _Tp&; + using const_reference _LIBCUDACXX_DEPRECATED_IN_CXX17 = const _Tp&; template struct _LIBCUDACXX_DEPRECATED_IN_CXX17 rebind { - typedef allocator<_Up> other; + using other = allocator<_Up>; }; _LIBCUDACXX_DEPRECATED_IN_CXX17 _LIBCUDACXX_HIDE_FROM_ABI const_pointer address(const_reference __x) const noexcept diff --git a/libcudacxx/include/cuda/std/__memory/allocator_arg_t.h b/libcudacxx/include/cuda/std/__memory/allocator_arg_t.h index ae88ce57615..d1ca1ab83a7 100644 --- a/libcudacxx/include/cuda/std/__memory/allocator_arg_t.h +++ b/libcudacxx/include/cuda/std/__memory/allocator_arg_t.h @@ -42,10 +42,10 @@ _CCCL_INLINE_VAR constexpr allocator_arg_t allocator_arg = allocator_arg_t(); template struct __uses_alloc_ctor_imp { - typedef _CCCL_NODEBUG_ALIAS remove_cvref_t<_Alloc> _RawAlloc; - static const bool __ua = uses_allocator<_Tp, _RawAlloc>::value; - static const bool __ic = is_constructible<_Tp, allocator_arg_t, _Alloc, _Args...>::value; - static const int value = __ua ? 2 - __ic : 0; + using _RawAlloc _CCCL_NODEBUG_ALIAS = remove_cvref_t<_Alloc>; + static const bool __ua = uses_allocator<_Tp, _RawAlloc>::value; + static const bool __ic = is_constructible<_Tp, allocator_arg_t, _Alloc, _Args...>::value; + static const int value = __ua ? 
2 - __ic : 0; }; template diff --git a/libcudacxx/include/cuda/std/__memory/allocator_destructor.h b/libcudacxx/include/cuda/std/__memory/allocator_destructor.h index e6fc850a086..a850e834282 100644 --- a/libcudacxx/include/cuda/std/__memory/allocator_destructor.h +++ b/libcudacxx/include/cuda/std/__memory/allocator_destructor.h @@ -29,11 +29,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template class __allocator_destructor { - typedef _CCCL_NODEBUG_ALIAS allocator_traits<_Alloc> __alloc_traits; + using __alloc_traits _CCCL_NODEBUG_ALIAS = allocator_traits<_Alloc>; public: - typedef _CCCL_NODEBUG_ALIAS typename __alloc_traits::pointer pointer; - typedef _CCCL_NODEBUG_ALIAS typename __alloc_traits::size_type size_type; + using pointer _CCCL_NODEBUG_ALIAS = typename __alloc_traits::pointer; + using size_type _CCCL_NODEBUG_ALIAS = typename __alloc_traits::size_type; private: _Alloc& __alloc_; diff --git a/libcudacxx/include/cuda/std/__memory/allocator_traits.h b/libcudacxx/include/cuda/std/__memory/allocator_traits.h index 035731687a3..a22e5b09695 100644 --- a/libcudacxx/include/cuda/std/__memory/allocator_traits.h +++ b/libcudacxx/include/cuda/std/__memory/allocator_traits.h @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include _CCCL_PUSH_MACROS @@ -567,7 +567,7 @@ using __rebind_alloc _CCCL_NODEBUG_ALIAS = typename _Traits::template rebind_all template struct __rebind_alloc_helper { - typedef _CCCL_NODEBUG_ALIAS typename _Traits::template rebind_alloc<_Tp> type; + using type _CCCL_NODEBUG_ALIAS = typename _Traits::template rebind_alloc<_Tp>; }; #undef _LIBCUDACXX_ALLOCATOR_TRAITS_HAS_XXX diff --git a/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h b/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h index 5752a48ec04..445c0166779 100644 --- a/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h +++ b/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h @@ -36,7 +36,7 @@ struct __builtin_new_allocator { struct __builtin_new_deleter { - typedef void* pointer_type; + using pointer_type = void*; _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit __builtin_new_deleter(size_t __size, size_t __align) noexcept : __size_(__size) @@ -53,7 +53,7 @@ struct __builtin_new_allocator size_t __align_; }; - typedef unique_ptr __holder_t; + using __holder_t = unique_ptr; _LIBCUDACXX_HIDE_FROM_ABI static __holder_t __allocate_bytes(size_t __s, size_t __align) { diff --git a/libcudacxx/include/cuda/std/__memory/pointer_traits.h b/libcudacxx/include/cuda/std/__memory/pointer_traits.h index d102dde7a74..cd04ccfaad9 100644 --- a/libcudacxx/include/cuda/std/__memory/pointer_traits.h +++ b/libcudacxx/include/cuda/std/__memory/pointer_traits.h @@ -49,19 +49,19 @@ struct __pointer_traits_element_type; template struct __pointer_traits_element_type<_Ptr, true> { - typedef _CCCL_NODEBUG_ALIAS typename _Ptr::element_type type; + using type _CCCL_NODEBUG_ALIAS = typename _Ptr::element_type; }; template