diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5e2f46714d9..e0b315f34fc 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -8,10 +8,9 @@ notebooks/ @rapidsai/cudf-python-codeowners python/dask_cudf/ @rapidsai/cudf-dask-codeowners #cmake code owners -cpp/CMakeLists.txt @rapidsai/cudf-cmake-codeowners -cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners -**/cmake/ @rapidsai/cudf-cmake-codeowners -*.cmake @rapidsai/cudf-cmake-codeowners +CMakeLists.txt @rapidsai/cudf-cmake-codeowners +**/cmake/ @rapidsai/cudf-cmake-codeowners +*.cmake @rapidsai/cudf-cmake-codeowners #java code owners java/ @rapidsai/cudf-java-codeowners diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index fb7182f4133..65aebfb7f8c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7c0bd6d52e2..e955b8f1f80 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -13,6 +13,7 @@ jobs: # Please keep pr-builder as the top job here pr-builder: needs: + - check-nightly-ci - changed-files - checks - conda-cpp-build @@ -52,7 +53,20 @@ jobs: OTEL_SERVICE_NAME: 'pr-cudf' steps: - name: Telemetry setup + if: ${{ vars.TELEMETRY_ENABLED == 'true' }} uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main + check-nightly-ci: + # Switch to ubuntu-latest once it defaults to a version of Ubuntu that + # provides at least Python 3.11 (see + # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat) + runs-on: ubuntu-24.04 + env: + RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Check if nightly CI is passing + uses: rapidsai/shared-actions/check_nightly_success/dispatch@main + with: + repo: cudf changed-files: secrets: inherit needs: telemetry-setup @@ -172,7 +186,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" static-configure: needs: checks @@ -193,7 +207,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -203,7 +217,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks @@ -327,16 +341,11 @@ jobs: run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" telemetry-summarize: - runs-on: ubuntu-latest + # This job must use a self-hosted runner to record telemetry traces. + runs-on: linux-amd64-cpu4 needs: pr-builder - if: always() + if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} continue-on-error: true steps: - - name: Load stashed telemetry env vars - uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main - with: - load_service_name: true - name: Telemetry summarize - uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main - with: - cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}" + uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 858352f515d..dc82c17022a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit @@ -94,7 +94,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -106,7 +106,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 3b972f31ca4..01dd2436beb 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 4290d013fe4..52d8f659611 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -35,6 +35,10 @@ rapids-mamba-retry install \ export RAPIDS_DOCS_DIR="$(mktemp -d)" +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + rapids-logger "Build CPP docs" pushd cpp/doxygen aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag" @@ -58,3 +62,5 @@ mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" popd RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs + +exit ${EXITCODE} diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index af49942c8cd..d80e4fef0d0 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -1,11 +1,13 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -euo pipefail package_name="libcudf" package_dir="python/libcudf" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + rapids-logger "Generating build requirements" rapids-dependency-file-generator \ @@ -28,8 +30,6 @@ export PIP_NO_BUILD_ISOLATION=0 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - mkdir -p ${package_dir}/final_dist python -m auditwheel repair \ --exclude libnvcomp.so.4 \ diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index f8ddbaba0f3..30e3ffc9a43 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -26,6 +26,8 @@ main() { LIBS=${LIBS#[} LIBS=${LIBS%]} + ANY_FAILURES=0 + for lib in ${LIBS//,/ }; do lib=$(echo "$lib" | tr -d '""') echo "Running tests for library $lib" @@ -56,10 +58,6 @@ main() { rapids-logger "Check GPU usage" nvidia-smi - EXITCODE=0 - trap "EXITCODE=1" ERR - set +e - rapids-logger "pytest ${lib}" NUM_PROCESSES=8 @@ -72,12 +70,20 @@ main() { fi done + EXITCODE=0 + trap "EXITCODE=1" ERR + set +e + TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh ${lib} + set -e rapids-logger "Test script exiting with value: ${EXITCODE}" + if [[ ${EXITCODE} != 0 ]]; then + ANY_FAILURES=1 + fi done - exit ${EXITCODE} + exit ${ANY_FAILURES} } main "$@" diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index db86721755d..3c6dba72164 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Support invoking test_python_cudf.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ @@ -24,8 +24,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -rapids-logger "pytest dask_cudf (dask-expr)" -DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ +rapids-logger "pytest dask_cudf" +./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ @@ -34,13 +34,6 @@ DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ --cov-report=term -rapids-logger "pytest dask_cudf (legacy)" -DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . - rapids-logger "pytest cudf_kafka" ./ci/run_cudf_kafka_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index e15949f4bdb..44f430ce98d 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -eou pipefail @@ -30,21 +30,11 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" # Run tests in dask_cudf/tests and dask_cudf/io/tests -rapids-logger "pytest dask_cudf (dask-expr)" +rapids-logger "pytest dask_cudf" pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ +python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ . popd - -# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy) -rapids-logger "pytest dask_cudf (legacy)" -pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . -popd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 87c40421be0..a8e5018b283 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.8.5,<12.0a0 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit @@ -55,7 +55,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13,<0.0.18 +- numba-cuda>=0.2.0,<0.3.0 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 @@ -66,12 +66,12 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.15 +- polars>=1.11,<1.18 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 -- pynvml>=11.4.1,<12.0.0a0 +- pynvml>=12.0.0,<13.0.0a0 - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov @@ -87,7 +87,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 0935de96d19..6dc99b14f5d 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.6.2,<13.0a0 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 @@ -54,7 +54,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13,<0.0.18 +- numba-cuda>=0.2.0,<0.3.0 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.1.0.6 @@ -64,12 +64,12 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.15 +- polars>=1.11,<1.18 - pre-commit - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 - pynvjitlink>=0.0.0a0 -- pynvml>=11.4.1,<12.0.0a0 +- pynvml>=12.0.0,<13.0.0a0 - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov @@ -86,7 +86,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index b6c03dc1bc2..7a0005497df 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.15 + - polars >=1.11,<1.18 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index e52b8c5f2a0..b34496cc256 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -80,7 +80,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.0.13,<0.0.18 + - numba-cuda >=0.2.0,<0.3.0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.8.5,<12.0a0 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.6.2,<13.0a0 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 74ecded8ead..a476d5d53df 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - cudf ={{ version }} - - pynvml >=11.4.1,<12.0.0a0 + - pynvml >=12.0.0,<13.0.0a0 - rapids-dask-dependency ={{ minor_version }} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c78ca326005..00020fdf6b8 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -31,9 +31,6 @@ fmt_version: flatbuffers_version: - "=24.3.25" -spdlog_version: - - ">=1.14.1,<1.15" - nvcomp_version: - "=4.1.0.6" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 1c2e9e8dd98..b585aafc397 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -68,7 +68,6 @@ requirements: - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - flatbuffers {{ flatbuffers_version }} - - spdlog {{ spdlog_version }} - zlib {{ zlib_version }} outputs: diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 3d965f30986..08eab363af0 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -83,9 +83,9 @@ requirements: - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.8.5,<12.0a0 {% else %} - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.6.2,<13.0a0 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12e6826f301..9dabe4e8800 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -273,6 +273,11 @@ endif() # add third party dependencies using CPM rapids_cpm_init() + +include(${rapids-cmake-dir}/cpm/rapids_logger.cmake) +rapids_cpm_rapids_logger() +rapids_make_logger(cudf EXPORT_SET cudf-exports LOGGER_DEFAULT_LEVEL WARN) + # find jitify include(cmake/thirdparty/get_jitify.cmake) # find NVTX @@ -299,8 +304,6 @@ include(cmake/Modules/JitifyPreprocessKernels.cmake) include(cmake/thirdparty/get_kvikio.cmake) # find fmt include(cmake/thirdparty/get_fmt.cmake) -# find spdlog -include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) # find thread_pool @@ -440,7 +443,6 @@ add_library( src/groupby/sort/group_quantiles.cu src/groupby/sort/group_std.cu src/groupby/sort/group_sum.cu - src/groupby/sort/scan.cpp src/groupby/sort/group_count_scan.cu src/groupby/sort/group_max_scan.cu src/groupby/sort/group_min_scan.cu @@ -448,6 +450,8 @@ add_library( src/groupby/sort/group_rank_scan.cu src/groupby/sort/group_replace_nulls.cu src/groupby/sort/group_sum_scan.cu + src/groupby/sort/host_udf_aggregation.cpp + src/groupby/sort/scan.cpp src/groupby/sort/sort_helper.cu src/hash/md5_hash.cu src/hash/murmurhash3_x86_32.cu @@ -457,6 +461,7 @@ add_library( src/hash/sha256_hash.cu src/hash/sha384_hash.cu src/hash/sha512_hash.cu + src/hash/xxhash_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/arrow_utilities.cpp @@ -772,7 +777,6 @@ add_library( src/utilities/default_stream.cpp src/utilities/host_memory.cpp src/utilities/linked_column.cpp - src/utilities/logger.cpp src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp @@ -910,11 +914,10 @@ if(CUDF_LARGE_STRINGS_DISABLED) target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED) endif() -# Define RMM logging level -target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL") - -# Define spdlog level -target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Define logging level +target_compile_definitions( + cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=CUDF_LOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}" +) # Enable remote IO through KvikIO target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) @@ -928,14 +931,17 @@ if(TARGET CUDA::cuFile${_cufile_suffix}) target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND) endif() +# Remove this after upgrading to a CCCL that has a proper CMake option. See +# https://github.com/NVIDIA/cccl/pull/2844 +target_compile_definitions(cudf PRIVATE THRUST_FORCE_32_BIT_OFFSET_TYPE=1) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ - spdlog::spdlog_header_only + PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ cudf_logger PRIVATE $ cuco::cuco ZLIB::ZLIB @@ -944,6 +950,7 @@ target_link_libraries( $ nanoarrow rmm::rmm_logger_impl + cudf_logger_impl ) # Add Conda library, and include paths if specified @@ -1099,7 +1106,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" ) target_include_directories(${_tgt} PRIVATE "$") - target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm rmm::rmm_logger rmm::rmm_logger_impl) if(CUDF_BUILD_STACKTRACE_DEBUG) target_link_libraries(${_tgt} PRIVATE cudf_backtrace) endif() diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8e5ea900efa..0ff712c1c77 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) endfunction() # ################################################################################################## -# * column benchmarks ----------------------------------------------------------------------------- -ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp) +# * copying benchmarks +# ----------------------------------------------------------------------------- +ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp) # ################################################################################################## # * gather benchmark ------------------------------------------------------------------------------ @@ -351,11 +352,18 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/subword.cpp) - ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH + text/edit_distance.cpp + text/hash_ngrams.cpp + text/jaccard.cpp + text/minhash.cpp + text/ngrams.cpp + text/normalize.cpp + text/replace.cpp + text/subword.cpp + text/tokenize.cpp + text/vocab.cpp ) # ################################################################################################## @@ -417,6 +425,11 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp) # --------------------------------------------------------------------------------- ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp) +# ################################################################################################## +# * rolling benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp) + add_custom_target( run_benchmarks DEPENDS CUDF_BENCHMARKS diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp deleted file mode 100644 index 51106c72137..00000000000 --- a/cpp/benchmarks/column/concatenate.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -class Concatenate : public cudf::benchmark {}; - -template -static void BM_concatenate(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - - auto input = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - auto input_columns = input->view(); - std::vector column_views(input_columns.begin(), input_columns.end()); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T)); -} - -#define CONCAT_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_BENCHMARK_DEFINE(int64_t, false) -CONCAT_BENCHMARK_DEFINE(int64_t, true) - -template -static void BM_concatenate_tables(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - cudf::size_type const num_tables = state.range(2); - - std::vector> tables(num_tables); - std::generate_n(tables.begin(), num_tables, [&]() { - return create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - }); - - // Generate table views - std::vector table_views(num_tables); - std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable { - return table->view(); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(table_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T)); -} - -#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_tables(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false) -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true) - -class ConcatenateStrings : public cudf::benchmark {}; - -template -static void BM_concatenate_strings(benchmark::State& state) -{ - using column_wrapper = cudf::test::strings_column_wrapper; - - auto const num_rows = state.range(0); - auto const num_chars = state.range(1); - auto const num_cols = state.range(2); - - std::string str(num_chars, 'a'); - - // Create owning columns - std::vector columns; - columns.reserve(num_cols); - std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() { - auto iter = thrust::make_constant_iterator(c_str); - if (Nullable) { - auto count_it = thrust::make_counting_iterator(0); - auto valid_iter = - thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; }); - return column_wrapper(iter, iter + num_rows, valid_iter); - } else { - return column_wrapper(iter, iter + num_rows); - } - }); - - // Generate column views - std::vector column_views; - column_views.reserve(columns.size()); - std::transform( - columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) { - return static_cast(col); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * - (sizeof(int32_t) + num_chars)); // offset + chars -} - -#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_strings(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_STRINGS_BENCHMARK_DEFINE(false) -CONCAT_STRINGS_BENCHMARK_DEFINE(true) diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp new file mode 100644 index 00000000000..586b479d0ad --- /dev/null +++ b/cpp/benchmarks/copying/concatenate.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +#include + +#include + +static void bench_concatenate(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const nulls = static_cast(state.get_float64("nulls")); + + auto input = create_sequence_table( + cycle_dtypes({cudf::type_to_id()}, num_cols), row_count{num_rows}, nulls); + auto input_columns = input->view(); + auto column_views = std::vector(input_columns.begin(), input_columns.end()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_reads(num_rows * num_cols); + state.add_global_memory_writes(num_rows * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate) + .set_name("concatenate") + .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144}) + .add_int64_axis("num_cols", {2, 8, 64, 512, 1024}) + .add_float64_axis("nulls", {0.0, 0.3}); + +static void bench_concatenate_strings(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const nulls = static_cast(state.get_float64("nulls")); + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .null_probability(nulls); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const input = column->view(); + + auto column_views = std::vector(num_cols, input); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const sv = cudf::strings_column_view(input); + state.add_global_memory_reads(sv.chars_size(stream) * num_cols); + state.add_global_memory_writes(sv.chars_size(stream) * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate_strings) + .set_name("concatenate_strings") + .add_int64_axis("num_rows", {256, 512, 4096, 16384}) + .add_int64_axis("num_cols", {2, 8, 64, 256}) + .add_int64_axis("row_width", {32, 128}) + .add_float64_axis("nulls", {0.0, 0.3}); diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45b46005c47..38a21961735 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu index 3502cbcea2a..1085b03ac7b 100644 --- a/cpp/benchmarks/join/distinct_join.cu +++ b/cpp/benchmarks/join/distinct_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,13 +23,8 @@ void distinct_inner_join(nvbench::state& state, auto join = [](cudf::table_view const& probe_input, cudf::table_view const& build_input, cudf::null_equality compare_nulls) { - auto const has_nulls = - cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - auto hj_obj = cudf::distinct_hash_join{ - build_input, probe_input, has_nulls, compare_nulls}; - return hj_obj.inner_join(); + auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls}; + return hj_obj.inner_join(probe_input); }; BM_join(state, join); @@ -42,13 +37,8 @@ void distinct_left_join(nvbench::state& state, auto join = [](cudf::table_view const& probe_input, cudf::table_view const& build_input, cudf::null_equality compare_nulls) { - auto const has_nulls = - cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - auto hj_obj = cudf::distinct_hash_join{ - build_input, probe_input, has_nulls, compare_nulls}; - return hj_obj.left_join(); + auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls}; + return hj_obj.left_join(probe_input); }; BM_join(state, join); diff --git a/cpp/benchmarks/rolling/grouped_rolling_sum.cpp b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp new file mode 100644 index 00000000000..04afe5ac661 --- /dev/null +++ b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +#include + +template +void bench_row_grouped_rolling_sum(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const preceding_size = static_cast(state.get_int64("preceding_size")); + auto const following_size = static_cast(state.get_int64("following_size")); + auto const min_periods = static_cast(state.get_int64("min_periods")); + + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + auto keys = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + return cudf::sort(cudf::table_view{{keys->view()}}); + }(); + data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + auto req = cudf::make_sum_aggregation(); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = cudf::grouped_rolling_window( + keys->view(), vals->view(), preceding_size, following_size, min_periods, *req); + }); + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_grouped_rolling_sum") + .add_int64_power_of_two_axis("num_rows", {14, 28}) + .add_int64_axis("preceding_size", {1, 10}) + .add_int64_axis("following_size", {2}) + .add_int64_axis("min_periods", {1}) + .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000}); diff --git a/cpp/benchmarks/rolling/rolling_sum.cpp b/cpp/benchmarks/rolling/rolling_sum.cpp new file mode 100644 index 00000000000..af9ecd6a26f --- /dev/null +++ b/cpp/benchmarks/rolling/rolling_sum.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +template +void bench_row_fixed_rolling_sum(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const preceding_size = static_cast(state.get_int64("preceding_size")); + auto const following_size = static_cast(state.get_int64("following_size")); + auto const min_periods = static_cast(state.get_int64("min_periods")); + + data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + auto req = cudf::make_sum_aggregation(); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = + cudf::rolling_window(vals->view(), preceding_size, following_size, min_periods, *req); + }); + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +template +void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const preceding_size = static_cast(state.get_int64("preceding_size")); + auto const following_size = static_cast(state.get_int64("following_size")); + + auto vals = [&]() { + data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto preceding = [&]() { + auto data = std::vector(num_rows); + auto it = thrust::make_counting_iterator(0); + std::transform(it, it + num_rows, data.begin(), [num_rows, preceding_size](auto i) { + return std::min(i + 1, std::max(preceding_size, i + 1 - num_rows)); + }); + auto buf = rmm::device_buffer( + data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream()); + cudf::get_default_stream().synchronize(); + return std::make_unique(cudf::data_type(cudf::type_to_id()), + num_rows, + std::move(buf), + rmm::device_buffer{}, + 0); + }(); + + auto following = [&]() { + auto data = std::vector(num_rows); + auto it = thrust::make_counting_iterator(0); + std::transform(it, it + num_rows, data.begin(), [num_rows, following_size](auto i) { + return std::max(-i - 1, std::min(following_size, num_rows - i - 1)); + }); + auto buf = rmm::device_buffer( + data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream()); + cudf::get_default_stream().synchronize(); + return std::make_unique(cudf::data_type(cudf::type_to_id()), + num_rows, + std::move(buf), + rmm::device_buffer{}, + 0); + }(); + + auto req = cudf::make_sum_aggregation(); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = + cudf::rolling_window(vals->view(), preceding->view(), following->view(), 1, *req); + }); + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_fixed_rolling_sum") + .add_int64_power_of_two_axis("num_rows", {14, 22, 28}) + .add_int64_axis("preceding_size", {1, 10, 100}) + .add_int64_axis("following_size", {2}) + .add_int64_axis("min_periods", {1, 20}); + +NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_variable_rolling_sum") + .add_int64_power_of_two_axis("num_rows", {14, 22, 28}) + .add_int64_axis("preceding_size", {10, 100}) + .add_int64_axis("following_size", {2}); diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index fa017ca9e29..267aa3a93f3 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -63,8 +63,8 @@ void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list) cudf::size_type const num_rows = state.get_int64("NumRows"); auto const keep = get_keep(state.get_string("keep")); cudf::size_type const cardinality = state.get_int64("cardinality"); + auto const null_probability = state.get_float64("null_probability"); if (cardinality > num_rows) { state.skip("cardinality > num_rows"); @@ -42,7 +43,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) data_profile profile = data_profile_builder() .cardinality(cardinality) - .null_probability(0.01) + .null_probability(null_probability) .distribution(cudf::type_to_id(), distribution_id::UNIFORM, static_cast(0), @@ -65,6 +66,7 @@ using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.01}) .add_string_axis("keep", {"any", "first", "last", "none"}) .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index cd4d3ca964b..9750475a079 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -24,18 +24,14 @@ void bench_case(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const max_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const encoding = state.get_string("encoding"); - if (static_cast(n_rows) * static_cast(max_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); auto col_view = column->view(); @@ -74,6 +70,7 @@ void bench_case(nvbench::state& state) NVBENCH_BENCH(bench_case) .set_name("case") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index eec9a5f54d7..abc5254392e 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -25,16 +25,12 @@ static void bench_char_types(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const api_type = state.get_string("api"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_char_types(nvbench::state& state) NVBENCH_BENCH(bench_char_types) .set_name("char_types") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index a73017dda18..e3940cbc0c7 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -29,17 +29,12 @@ std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43" static void bench_contains(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto col = create_string_column(n_rows, row_width, hit_rate); + auto col = create_string_column(num_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; @@ -56,7 +51,7 @@ static void bench_contains(nvbench::state& state) NVBENCH_BENCH(bench_contains) .set_name("contains") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {50, 100}) // percentage .add_int64_axis("pattern", {0, 1, 2}); diff --git a/cpp/benchmarks/string/copy_if_else.cpp b/cpp/benchmarks/string/copy_if_else.cpp index e06cca497c2..5a5743dfddf 100644 --- a/cpp/benchmarks/string/copy_if_else.cpp +++ b/cpp/benchmarks/string/copy_if_else.cpp @@ -25,15 +25,11 @@ static void bench_copy(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const str_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const source_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile); auto const target_table = @@ -58,5 +54,6 @@ static void bench_copy(nvbench::state& state) NVBENCH_BENCH(bench_copy) .set_name("copy_if_else") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/copy_range.cpp b/cpp/benchmarks/string/copy_range.cpp index af217a49195..7e7353a0e78 100644 --- a/cpp/benchmarks/string/copy_range.cpp +++ b/cpp/benchmarks/string/copy_range.cpp @@ -25,16 +25,12 @@ static void bench_copy_range(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const source_tables = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile); @@ -56,5 +52,6 @@ static void bench_copy_range(nvbench::state& state) NVBENCH_BENCH(bench_copy_range) .set_name("copy_range") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index f964bc5d224..cf90e316f71 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -30,16 +30,12 @@ static std::string patterns[] = {"\\d+", "a"}; static void bench_count(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_count(nvbench::state& state) NVBENCH_BENCH(bench_count) .set_name("count") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index af4fedb5799..d6866598ff4 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -32,11 +32,6 @@ static void bench_extract(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto groups = static_cast(state.get_int64("groups")); std::default_random_engine generator; @@ -79,6 +74,6 @@ static void bench_extract(nvbench::state& state) NVBENCH_BENCH(bench_extract) .set_name("extract") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("groups", {1, 2, 4}); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp index 6dcf731ad3c..27652193b7b 100644 --- a/cpp/benchmarks/string/join_strings.cpp +++ b/cpp/benchmarks/string/join_strings.cpp @@ -25,15 +25,11 @@ static void bench_join(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -54,5 +50,6 @@ static void bench_join(nvbench::state& state) NVBENCH_BENCH(bench_join) .set_name("strings_join") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index a19060ead3b..8156e19412b 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -25,15 +25,11 @@ static void bench_lengths(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_lengths(nvbench::state& state) NVBENCH_BENCH(bench_lengths) .set_name("lengths") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 105ae65cbe8..f6410aaef30 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -30,11 +30,6 @@ static void bench_like(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); @@ -54,6 +49,6 @@ static void bench_like(nvbench::state& state) NVBENCH_BENCH(bench_like) .set_name("strings_like") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {10, 25, 70, 100}); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index 4dcf1314f83..69426a2d484 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -26,18 +26,14 @@ static void bench_replace(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const rtype = state.get_string("type"); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto program = cudf::strings::regex_program::create("(\\d+)"); @@ -62,6 +58,7 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"replace", "backref"}); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index a2676609a40..e2e914cb350 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -25,15 +25,11 @@ static void bench_reverse(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_reverse(nvbench::state& state) NVBENCH_BENCH(bench_reverse) .set_name("reverse") - .add_int64_axis("row_width", {8, 16, 32, 64, 128}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index 1898f0340b6..c828a8ed0b0 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -36,11 +36,6 @@ static void bench_slice(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); @@ -76,6 +71,6 @@ static void bench_slice(nvbench::state& state) NVBENCH_BENCH(bench_slice) .set_name("slice") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"position", "multi"}); diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 9ef58daf0fc..9c7c27c4f07 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -28,16 +28,12 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); cudf::string_scalar target("+"); @@ -66,6 +62,7 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"split", "split_ws", "record", "record_ws"}); diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp index 1fdb6e67109..34a7aa96e84 100644 --- a/cpp/benchmarks/string/split_re.cpp +++ b/cpp/benchmarks/string/split_re.cpp @@ -28,17 +28,13 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto prog = cudf::strings::regex_program::create("\\d+"); data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -56,5 +52,6 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp deleted file mode 100644 index a34026281e8..00000000000 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -/** - * @brief Generate row count and row length argument ranges for a string benchmark. - * - * Generates a series of row count and row length arguments for string benchmarks. - * Combinations of row count and row length that would exceed the maximum string character - * column data length are not generated. - * - * @param b Benchmark to update with row count and row length arguments. - * @param min_rows Minimum row count argument to generate. - * @param max_rows Maximum row count argument to generate. - * @param rows_mult Row count multiplier to generate intermediate row count arguments. - * @param min_rowlen Minimum row length argument to generate. - * @param max_rowlen Maximum row length argument to generate. - * @param rowlen_mult Row length multiplier to generate intermediate row length arguments. - */ -inline void generate_string_bench_args(benchmark::internal::Benchmark* b, - int min_rows, - int max_rows, - int rows_mult, - int min_rowlen, - int max_rowlen, - int rowlen_mult) -{ - for (int row_count = min_rows; row_count <= max_rows; row_count *= rows_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= rowlen_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } - } -} diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 6ffa90edb8f..0ad1ae30f8c 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -27,15 +27,11 @@ static void bench_edit_distance(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input1(strings_table->view().column(0)); @@ -55,5 +51,6 @@ static void bench_edit_distance(nvbench::state& state) NVBENCH_BENCH(bench_edit_distance) .set_name("edit_distance") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144}); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp index 4e5daf83a3c..7577cf00c0f 100644 --- a/cpp/benchmarks/text/hash_ngrams.cpp +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -27,16 +27,12 @@ static void bench_hash_ngrams(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const ngrams = static_cast(state.get_int64("ngrams")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); @@ -55,6 +51,7 @@ static void bench_hash_ngrams(nvbench::state& state) NVBENCH_BENCH(bench_hash_ngrams) .set_name("hash_ngrams") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {16384, 32768, 262144}) .add_int64_axis("ngrams", {5, 10}); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d5b74da6773..5506501138b 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -28,17 +28,13 @@ static void bench_jaccard(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const substring_width = static_cast(state.get_int64("substring_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const input_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); @@ -59,6 +55,7 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 1024, 2048}) .add_int64_axis("num_rows", {32768, 131072, 262144}) - .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index a80d0dcbdb8..8c86e8d4366 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -54,9 +54,8 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 - ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) - : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); + auto result = base64 ? nvtext::minhash64(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash(input, 0, parameters_a, parameters_b, hash_width); }); } diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 71bccd80d39..594dc0de28a 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -28,16 +28,12 @@ static void bench_normalize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const normalize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -60,6 +56,7 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 767ebab3eee..24ca4e5dfd7 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -31,11 +31,6 @@ static void bench_replace(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - std::vector words{" ", "one ", "two ", "three ", "four ", "five ", "six ", "sevén ", "eight ", "nine ", "ten ", "eleven ", "twelve ", "thirteen ", "fourteen ", @@ -71,5 +66,5 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index dd8df695d3e..0b4e3bdefa5 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,6 @@ * limitations under the License. */ -#include -#include - #include #include @@ -24,6 +21,8 @@ #include +#include + #include #include #include @@ -54,40 +53,33 @@ static std::string create_hash_vocab_file() return hash_file; } -static void BM_subword_tokenizer(benchmark::State& state) +static void bench_subword_tokenizer(nvbench::state& state) { - auto const nrows = static_cast(state.range(0)); - std::vector h_strings(nrows, "This is a test "); + auto const num_rows = static_cast(state.get_int64("num_rows")); + + std::vector h_strings(num_rows, "This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); static std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; - uint32_t max_sequence_length = 64; - uint32_t stride = 48; - uint32_t do_truncate = 0; - uint32_t do_lower = 1; - // - auto vocab = nvtext::load_vocabulary_file(hash_file); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - *vocab, - max_sequence_length, - stride, - do_lower, - do_truncate); - } -} + uint32_t max_sequence = 64; + uint32_t stride = 48; + uint32_t do_truncate = 0; + uint32_t do_lower = 1; -class Subword : public cudf::benchmark {}; + auto input = cudf::strings_column_view{strings}; -#define SUBWORD_BM_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \ - BENCHMARK_REGISTER_F(Subword, name) \ - ->RangeMultiplier(2) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows * max_sequence); -SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer); + auto vocab = nvtext::load_vocabulary_file(hash_file); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = + nvtext::subword_tokenize(input, *vocab, max_sequence, stride, do_lower, do_truncate); + }); +} -// BENCHMARK_MAIN(); +NVBENCH_BENCH(bench_subword_tokenizer) + .set_name("subword_tokenize") + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index e83310e0343..b9590c5539f 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -31,17 +31,13 @@ static void bench_tokenize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const tokenize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -82,6 +78,7 @@ static void bench_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_tokenize) .set_name("tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"}); diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 523d277df18..0502f375d99 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -33,16 +33,12 @@ static void bench_vocab_tokenize(nvbench::state& state) { auto const stream = cudf::get_default_stream(); auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto const column = [num_rows, row_width] { + auto const column = [num_rows, min_width, max_width] { data_profile const profile = data_profile_builder().no_validity().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); return cudf::strings::filter_characters_of_type( cudf::strings_column_view(col->view()), @@ -85,5 +81,6 @@ static void bench_vocab_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_vocab_tokenize) .set_name("vocab_tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp deleted file mode 100644 index adc3dddc59c..00000000000 --- a/cpp/benchmarks/text/word_minhash.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include - -#include - -#include - -#include - -static void bench_word_minhash(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); - auto const base64 = state.get_int64("hash_type") == 64; - - data_profile const strings_profile = - data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); - auto strings_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); - - auto const num_offsets = (num_rows / row_width) + 1; - auto offsets = cudf::sequence(num_offsets, - cudf::numeric_scalar(0), - cudf::numeric_scalar(row_width)); - - auto source = cudf::make_lists_column(num_offsets - 1, - std::move(offsets), - std::move(strings_table->release().front()), - 0, - rmm::device_buffer{}); - - data_profile const seeds_profile = data_profile_builder().no_validity().distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, 256); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - - cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); - auto chars_size = input.chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); - state.add_global_memory_writes(num_rows); // output are hashes - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) - : nvtext::word_minhash(source->view(), seeds.view()); - }); -} - -NVBENCH_BENCH(bench_word_minhash) - .set_name("word_minhash") - .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) - .add_int64_axis("row_width", {10, 100, 1000}) - .add_int64_axis("seed_count", {2, 25}) - .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index c440643037b..b0c48e04710 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,11 +14,6 @@ # This function finds nanoarrow and sets any additional necessary environment variables. function(find_and_configure_nanoarrow) - include(${rapids-cmake-dir}/cpm/package_override.cmake) - - set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") - rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") - if(NOT BUILD_SHARED_LIBS) set(_exclude_from_all EXCLUDE_FROM_ALL FALSE) else() @@ -31,6 +26,9 @@ function(find_and_configure_nanoarrow) nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS + GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git + GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb + GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all} ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake deleted file mode 100644 index 90b0f4d8a8e..00000000000 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -# Use CPM to find or clone speedlog -function(find_and_configure_spdlog) - - include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog( - FMT_OPTION "EXTERNAL_FMT_HO" - INSTALL_EXPORT_SET cudf-exports - BUILD_EXPORT_SET cudf-exports - ) - -endfunction() - -find_and_configure_spdlog() diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 2f29578f7ae..d5cadce40c2 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff deleted file mode 100644 index e9a36fcb567..00000000000 --- a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff +++ /dev/null @@ -1,38 +0,0 @@ -diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h -index caa6be4..70ec8a2 100644 ---- a/src/nanoarrow/common/inline_buffer.h -+++ b/src/nanoarrow/common/inline_buffer.h -@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { - } - - static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { -- *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | -+ *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT - ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | - ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | - ((values[7] + 0x7f) & 0x80)); -@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l - // set bits within a single byte - const uint8_t only_byte_mask = - i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); -- bits[bytes_begin] &= only_byte_mask; -+ bits[bytes_begin] &= only_byte_mask; // NOLINT - bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); - return; - } - - // set/clear trailing bits of first byte -- bits[bytes_begin] &= first_byte_mask; -+ bits[bytes_begin] &= first_byte_mask; // NOLINT - bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); - - if (bytes_end - bytes_begin > 2) { -@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte -- *out_cursor = 0x00; -+ *out_cursor = 0x00; // NOLINT - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); - } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json deleted file mode 100644 index d529787e7c8..00000000000 --- a/cpp/cmake/thirdparty/patches/nanoarrow_override.json +++ /dev/null @@ -1,18 +0,0 @@ - -{ - "packages" : { - "nanoarrow" : { - "version" : "0.6.0.dev", - "git_url" : "https://github.com/apache/arrow-nanoarrow.git", - "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", - "git_shallow" : false, - "patches" : [ - { - "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", - "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", - "fixed_in" : "" - } - ] - } - } -} diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff deleted file mode 100644 index 9f68d85e7db..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ /dev/null @@ -1,22 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 3d004aa55..71ce86bea 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -63,7 +63,7 @@ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count1) \ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count2) - --#if defined(THRUST_FORCE_64_BIT_OFFSET_TYPE) -+#if 0 - //! @brief Always dispatches to 64 bit offset version of an algorithm - # define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ -@@ -89,7 +89,7 @@ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ - _THRUST_INDEX_TYPE_DISPATCH(std::uint64_t, status, call_64, count, arguments) - --#elif defined(THRUST_FORCE_32_BIT_OFFSET_TYPE) -+#elif 1 - - //! @brief Ensures that the size of the input does not overflow the offset type - # define _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(index_type, count) \ diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 1c1052487f2..5032a073b58 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1082,15 +1082,15 @@ initialization. If this setting is higher than the compile-time CMake variable, in between the two settings will be excluded from the written log. The available levels are the same as for the CMake variable. * Global logger object exposed via `cudf::logger()` - sets the minimum logging level at runtime. -For example, calling `cudf::logger().set_level(spdlog::level::err)`, will exclude any messages that +For example, calling `cudf::default_logger().set_level(level_enum::err)`, will exclude any messages that are not errors or critical errors. This API should not be used within libcudf to manipulate logging, its purpose is to allow upstream users to configure libcudf logging to fit their application. By default, logging messages are output to stderr. Setting the environment variable `LIBCUDF_DEBUG_LOG_FILE` redirects the log to a file with the specified path (can be relative to the current directory). -Upstream users can also manipulate `cudf::logger().sinks()` to add sinks or divert the log to -standard output or even a custom spdlog sink. +Upstream users can also manipulate `cudf::default_logger().sinks()` to add sinks or divert the log to +standard output. # Data Types diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index f5f514d26d9..a1b7db5e08a 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -110,8 +110,9 @@ class aggregation { COLLECT_SET, ///< collect values into a list without duplicate entries LEAD, ///< window function, accesses row at specified offset following current row LAG, ///< window function, accesses row at specified offset preceding current row - PTX, ///< PTX UDF based reduction - CUDA, ///< CUDA UDF based reduction + PTX, ///< PTX based UDF aggregation + CUDA, ///< CUDA based UDF aggregation + HOST_UDF, ///< host based UDF aggregation MERGE_LISTS, ///< merge multiple lists values into one list MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries MERGE_M2, ///< merge partial values of M2 aggregation, @@ -120,7 +121,7 @@ class aggregation { TDIGEST, ///< create a tdigest from a set of input values MERGE_TDIGEST, ///< create a tdigest by merging multiple tdigests together HISTOGRAM, ///< compute frequency of each element - MERGE_HISTOGRAM ///< merge partial values of HISTOGRAM aggregation, + MERGE_HISTOGRAM ///< merge partial values of HISTOGRAM aggregation }; aggregation() = delete; @@ -599,6 +600,18 @@ std::unique_ptr make_udf_aggregation(udf_type type, std::string const& user_defined_aggregator, data_type output_type); +// Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation. +struct host_udf_base; + +/** + * @brief Factory to create a HOST_UDF aggregation. + * + * @param host_udf An instance of a class derived from `host_udf_base` to perform aggregation + * @return A HOST_UDF aggregation object + */ +template +std::unique_ptr make_host_udf_aggregation(std::unique_ptr host_udf); + /** * @brief Factory to create a MERGE_LISTS aggregation. * diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp new file mode 100644 index 00000000000..bbce76dc5f3 --- /dev/null +++ b/cpp/include/cudf/aggregation/host_udf.hpp @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +/** + * @file host_udf.hpp + * @brief Declare the base class for host-side user-defined function (`HOST_UDF`) and example of + * subclass implementation. + */ + +namespace CUDF_EXPORT cudf { +/** + * @addtogroup aggregation_factories + * @{ + */ + +/** + * @brief The interface for host-based UDF implementation. + * + * An implementation of host-based UDF needs to be derived from this base class, defining + * its own version of the required functions. In particular: + * - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`, + * and `clone` functions. + * - If necessary, the derived class can also override `do_hash` to compute hashing for its + * instance, and `get_required_data` to selectively access to the input data as well as + * intermediate data provided by libcudf. + * + * Example of such implementation: + * @code{.cpp} + * struct my_udf_aggregation : cudf::host_udf_base { + * my_udf_aggregation() = default; + * + * // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`, + * // and the result from groupby `MAX` aggregation. + * [[nodiscard]] data_attribute_set_t get_required_data() const override + * { + * return {groupby_data_attribute::GROUPED_VALUES, + * groupby_data_attribute::GROUP_OFFSETS, + * cudf::make_max_aggregation()}; + * } + * + * [[nodiscard]] output_t get_empty_output( + * [[maybe_unused]] std::optional output_dtype, + * [[maybe_unused]] rmm::cuda_stream_view stream, + * [[maybe_unused]] rmm::device_async_resource_ref mr) const override + * { + * // This UDF aggregation always returns a column of type INT32. + * return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + * } + * + * [[nodiscard]] output_t operator()(input_map_t const& input, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform UDF computation using the input data and return the result. + * } + * + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override + * { + * // Check if the other object is also instance of this class. + * return dynamic_cast(&other) != nullptr; + * } + * + * [[nodiscard]] std::unique_ptr clone() const override + * { + * return std::make_unique(); + * } + * }; + * @endcode + */ +struct host_udf_base { + host_udf_base() = default; + virtual ~host_udf_base() = default; + + /** + * @brief Define the possible data needed for groupby aggregations. + * + * Note that only sort-based groupby aggregations are supported. + */ + enum class groupby_data_attribute : int32_t { + INPUT_VALUES, ///< The input values column. + GROUPED_VALUES, ///< The input values grouped according to the input `keys` for which the + ///< values within each group maintain their original order. + SORTED_GROUPED_VALUES, ///< The input values grouped according to the input `keys` and + ///< sorted within each group. + NUM_GROUPS, ///< The number of groups (i.e., number of distinct keys). + GROUP_OFFSETS, ///< The offsets separating groups. + GROUP_LABELS ///< Group labels (which is also the same as group indices). + }; + + /** + * @brief Describe possible data that may be needed in the derived class for its operations. + * + * Such data can be either intermediate data such as sorted values or group labels etc, or the + * results of other aggregations. + * + * Each derived host-based UDF class may need a different set of data. It is inefficient to + * evaluate and pass down all these possible data at once from libcudf. A solution for that is, + * the derived class can define a subset of data that it needs and libcudf will evaluate + * and pass down only data requested from that set. + */ + struct data_attribute { + /** + * @brief Hold all possible data types for the input of the aggregation in the derived class. + */ + using value_type = std::variant>; + value_type value; ///< The actual data attribute, wrapped by this struct + ///< as a wrapper is needed to define `hash` and `equal_to` functors. + + data_attribute() = default; ///< Default constructor + data_attribute(data_attribute&&) = default; ///< Move constructor + + /** + * @brief Construct a new data attribute from an aggregation attribute. + * @param value_ An aggregation attribute + */ + template )> + data_attribute(T value_) : value{value_} + { + } + + /** + * @brief Construct a new data attribute from another aggregation request. + * @param value_ An aggregation request + */ + template || + std::is_same_v)> + data_attribute(std::unique_ptr value_) : value{std::move(value_)} + { + CUDF_EXPECTS(std::get>(value) != nullptr, + "Invalid aggregation request."); + if constexpr (std::is_same_v) { + CUDF_EXPECTS( + dynamic_cast(std::get>(value).get()) != nullptr, + "Requesting results from other aggregations is only supported in groupby " + "aggregations."); + } + } + + /** + * @brief Copy constructor. + * @param other The other data attribute to copy from + */ + data_attribute(data_attribute const& other); + + /** + * @brief Hash functor for `data_attribute`. + */ + struct hash { + /** + * @brief Compute the hash value of a data attribute. + * @param attr The data attribute to hash + * @return The hash value of the data attribute + */ + std::size_t operator()(data_attribute const& attr) const; + }; // struct hash + + /** + * @brief Equality comparison functor for `data_attribute`. + */ + struct equal_to { + /** + * @brief Check if two data attributes are equal. + * @param lhs The left-hand side data attribute + * @param rhs The right-hand side data attribute + * @return True if the two data attributes are equal + */ + bool operator()(data_attribute const& lhs, data_attribute const& rhs) const; + }; // struct equal_to + }; // struct data_attribute + + /** + * @brief Set of attributes for the input data that is needed for computing the aggregation. + */ + using data_attribute_set_t = + std::unordered_set; + + /** + * @brief Return a set of attributes for the data that is needed for computing the aggregation. + * + * The derived class should return the attributes corresponding to only the data that it needs to + * avoid unnecessary computation performed in libcudf. If this function is not overridden, an + * empty set is returned. That means all the data attributes (except results from other + * aggregations in groupby) will be needed. + * + * @return A set of `data_attribute` + */ + [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; } + + /** + * @brief Hold all possible types of the data that is passed to the derived class for executing + * the aggregation. + */ + using input_data_t = std::variant>; + + /** + * @brief Input to the aggregation, mapping from each data attribute to its actual data. + */ + using input_map_t = std:: + unordered_map; + + /** + * @brief Output type of the aggregation. + * + * Currently only a single type is supported as the output of the aggregation, but it will hold + * more type in the future when reduction is supported. + */ + using output_t = std::variant>; + + /** + * @brief Get the output when the input values column is empty. + * + * This is called in libcudf when the input values column is empty. In such situations libcudf + * tries to generate the output directly without unnecessarily evaluating the intermediate data. + * + * @param output_dtype The expected output data type + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation when input values is empty + */ + [[nodiscard]] virtual output_t get_empty_output(std::optional output_dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; + + /** + * @brief Perform the main computation for the host-based UDF. + * + * @param input The input data needed for performing all computation + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation + */ + [[nodiscard]] virtual output_t operator()(input_map_t const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; + + /** + * @brief Computes hash value of the class's instance. + * @return The hash value of the instance + */ + [[nodiscard]] virtual std::size_t do_hash() const + { + return std::hash{}(static_cast(aggregation::Kind::HOST_UDF)); + } + + /** + * @brief Compares two instances of the derived class for equality. + * @param other The other derived class's instance to compare with + * @return True if the two instances are equal + */ + [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + + /** + * @brief Clones the instance. + * + * A class derived from `host_udf_base` should not store too much data such that its instances + * remain lightweight for efficient cloning. + * + * @return A new instance cloned from this + */ + [[nodiscard]] virtual std::unique_ptr clone() const = 0; +}; + +/** @} */ // end of group +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index db6d5255616..aacb5ccfede 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -33,11 +33,13 @@ #include #include +#include #include #include #include #include +#include /** * @file column_device_view.cuh @@ -56,8 +58,8 @@ namespace CUDF_EXPORT cudf { * */ struct nullate { - struct YES : std::bool_constant {}; - struct NO : std::bool_constant {}; + struct YES : cuda::std::bool_constant {}; + struct NO : cuda::std::bool_constant {}; /** * @brief `nullate::DYNAMIC` defers the determination of nullability to run time rather than * compile time. The calling code is responsible for specifying whether or not nulls are @@ -80,7 +82,7 @@ struct nullate { * @return `true` if nulls are expected in the operation in which this object is applied, * otherwise false */ - constexpr operator bool() const noexcept { return value; } + CUDF_HOST_DEVICE constexpr operator bool() const noexcept { return value; } bool value; ///< True if nulls are expected }; }; @@ -319,14 +321,14 @@ class alignas(16) column_device_view_base { } template - struct has_element_accessor_impl : std::false_type {}; + struct has_element_accessor_impl : cuda::std::false_type {}; template struct has_element_accessor_impl< C, T, - void_t().template element(std::declval()))>> - : std::true_type {}; + void_t().template element(cuda::std::declval()))>> + : cuda::std::true_type {}; }; // @cond // Forward declaration @@ -442,7 +444,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return string_view instance representing this element at this index */ template )> - __device__ [[nodiscard]] T element(size_type element_index) const noexcept + [[nodiscard]] __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset char const* d_strings = static_cast(_data); @@ -501,7 +503,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return dictionary32 instance representing this element at this index */ template )> - __device__ [[nodiscard]] T element(size_type element_index) const noexcept + [[nodiscard]] __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset auto const indices = d_children[0]; @@ -519,7 +521,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return numeric::fixed_point representing the element at this index */ template ())> - __device__ [[nodiscard]] T element(size_type element_index) const noexcept + [[nodiscard]] __device__ T element(size_type element_index) const noexcept { using namespace numeric; using rep = typename T::rep; @@ -534,7 +536,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return `true` if `column_device_view::element()` has a valid overload, `false` otherwise */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } @@ -1032,7 +1034,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @return Reference to the element at the specified index */ template ())> - __device__ [[nodiscard]] T& element(size_type element_index) const noexcept + [[nodiscard]] __device__ T& element(size_type element_index) const noexcept { return data()[element_index]; } @@ -1044,7 +1046,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @return `true` if `mutable_column_device_view::element()` has a valid overload, `false` */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } @@ -1425,13 +1427,13 @@ struct pair_rep_accessor { private: template , void>* = nullptr> - __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const + [[nodiscard]] __device__ inline auto get_rep(cudf::size_type i) const { return col.element(i); } template , void>* = nullptr> - __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const + [[nodiscard]] __device__ inline auto get_rep(cudf::size_type i) const { return col.element(i).value(); } diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index de53e7586cd..59011f7b138 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -31,12 +32,11 @@ #include #include -#include namespace cudf { namespace detail { template -constexpr bool is_product_supported() +CUDF_HOST_DEVICE constexpr bool is_product_supported() { return is_numeric(); } @@ -216,12 +216,12 @@ struct identity_initializer { * @throw cudf::logic_error if column type is not fixed-width * * @param table The table of columns to initialize. - * @param aggs A vector of aggregation operations corresponding to the table + * @param aggs A span of aggregation operations corresponding to the table * columns. The aggregations determine the identity value for each column. * @param stream CUDA stream used for device memory operations and kernel launches. */ void initialize_with_identity(mutable_table_view& table, - std::vector const& aggs, + host_span aggs, rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 6661a461b8b..d873e93bd20 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class lead_lag_aggregation const& agg); virtual std::vector> visit(data_type col_type, class udf_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class host_udf_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_lists_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -135,6 +138,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class collect_set_aggregation const& agg); virtual void visit(class lead_lag_aggregation const& agg); virtual void visit(class udf_aggregation const& agg); + virtual void visit(class host_udf_aggregation const& agg); virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); @@ -960,6 +964,35 @@ class udf_aggregation final : public rolling_aggregation { } }; +/** + * @brief Derived class for specifying host-based UDF aggregation. + */ +class host_udf_aggregation final : public groupby_aggregation { + public: + std::unique_ptr udf_ptr; + + host_udf_aggregation() = delete; + host_udf_aggregation(host_udf_aggregation const&) = delete; + + // Need to define the constructor and destructor in a separate source file where we have the + // complete declaration of `host_udf_base`. + explicit host_udf_aggregation(std::unique_ptr udf_ptr_); + ~host_udf_aggregation() override; + + [[nodiscard]] bool is_equal(aggregation const& _other) const override; + + [[nodiscard]] size_t do_hash() const override; + + [[nodiscard]] std::unique_ptr clone() const override; + + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived aggregation class for specifying MERGE_LISTS aggregation */ @@ -1462,6 +1495,12 @@ struct target_type_impl +struct target_type_impl { + // Just a placeholder. The actual return type is unknown. + using type = struct_view; +}; + /** * @brief Helper alias to get the accumulator type for performing aggregation * `k` on elements of type `Source` @@ -1579,6 +1618,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::EWMA: return f.template operator()(std::forward(args)...); + case aggregation::HOST_UDF: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 4159e324472..9226697a7f6 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -16,300 +16,25 @@ #pragma once -#include -#include #include -#include #include #include -#include -#include -#include -#include #include #include #include #include -#include #include -#include -#include #include -#include #include #include -#include -#include #include #include -#include - namespace cudf { namespace detail { -// Compute the count of elements that pass the mask within each block -template -CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - int count = 0; - - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - count += __syncthreads_count(mask_true); - tid += block_size; - } - - if (threadIdx.x == 0) block_counts[blockIdx.x] = count; -} - -// Compute the exclusive prefix sum of each thread's mask value within each block -template -__device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& block_sum) -{ - int offset = 0; - - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - BlockScan(temp_storage).ExclusiveSum(mask_true, offset, block_sum); - - return offset; -} - -// This kernel scatters data and validity mask of a column based on the -// scan of the boolean mask. The block offsets for the scan are already computed. -// Just compute the scan of the mask in each block and add it to the block's -// output offset. This is the output index of each element. Scattering -// the valid mask is not as easy, because each thread is only responsible for -// one bit. Warp-level processing (ballot) makes this simpler. -// To make scattering efficient, we "coalesce" the block's scattered data and -// valids in shared memory, and then write from shared memory to global memory -// in a contiguous manner. -// The has_validity template parameter specializes this kernel for the -// non-nullable case for performance without writing another kernel. -// -// Note: `filter` is not run on indices larger than the input column size -template -__launch_bounds__(block_size) CUDF_KERNEL - void scatter_kernel(cudf::mutable_column_device_view output_view, - cudf::size_type* output_null_count, - cudf::column_device_view input_view, - cudf::size_type const* __restrict__ block_offsets, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - T* __restrict__ output_data = output_view.data(); - cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask(); - static_assert(block_size <= 1024, "Maximum thread block size exceeded"); - - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - cudf::size_type block_offset = block_offsets[blockIdx.x]; - - // one extra warp worth in case the block is not aligned - __shared__ bool temp_valids[has_validity ? block_size + cudf::detail::warp_size : 1]; - __shared__ T temp_data[block_size]; - - cudf::size_type warp_valid_counts{0}; // total valid sum over the `per_thread` loop below - cudf::size_type block_sum = 0; // count passing filter over the `per_thread` loop below - - // Note that since the maximum gridDim.x on all supported GPUs is as big as - // cudf::size_type, this loop is sufficient to cover our maximum column size - // regardless of the value of block_size and per_thread. - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - - cudf::size_type tmp_block_sum = 0; - // get output location using a scan of the mask result - cudf::size_type const local_index = block_scan_mask(mask_true, tmp_block_sum); - block_sum += tmp_block_sum; - - if (has_validity) { - temp_valids[threadIdx.x] = false; // init shared memory - if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false; - __syncthreads(); // wait for init - } - - if (mask_true) { - temp_data[local_index] = input_view.data()[tid]; // scatter data to shared - - // scatter validity mask to shared memory - if (has_validity and input_view.is_valid(tid)) { - // determine aligned offset for this warp's output - cudf::size_type const aligned_offset = block_offset % cudf::detail::warp_size; - temp_valids[local_index + aligned_offset] = true; - } - } - - __syncthreads(); // wait for shared data and validity mask to be complete - - // Copy output data coalesced from shared to global - if (threadIdx.x < tmp_block_sum) - output_data[block_offset + threadIdx.x] = temp_data[threadIdx.x]; - - if (has_validity) { - // Since the valid bools are contiguous in shared memory now, we can use - // __popc to combine them into a single mask element. - // Then, most mask elements can be directly copied from shared to global - // memory. Only the first and last 32-bit mask elements of each block must - // use an atomicOr, because these are where other blocks may overlap. - - constexpr int num_warps = block_size / cudf::detail::warp_size; - // account for partial blocks with non-warp-aligned offsets - int const last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; - int const last_warp = min(num_warps, last_index / cudf::detail::warp_size); - int const wid = threadIdx.x / cudf::detail::warp_size; - int const lane = threadIdx.x % cudf::detail::warp_size; - - cudf::size_type tmp_warp_valid_counts{0}; - - if (tmp_block_sum > 0 && wid <= last_warp) { - int valid_index = (block_offset / cudf::detail::warp_size) + wid; - - // compute the valid mask for this warp - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[threadIdx.x]); - - // Note the atomicOr's below assume that output_valid has been set to - // all zero before the kernel - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts = __popc(valid_warp); - if (wid > 0 && wid < last_warp) - output_valid[valid_index] = valid_warp; - else { - cuda::atomic_ref ref{ - output_valid[valid_index]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - - // if the block is full and not aligned then we have one more warp to cover - if ((wid == 0) && (last_warp == num_warps)) { - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts += __popc(valid_warp); - cuda::atomic_ref ref{ - output_valid[valid_index + num_warps]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - } - warp_valid_counts += tmp_warp_valid_counts; - } - - block_offset += tmp_block_sum; - tid += block_size; - } - // Compute total null_count for this block and add it to global count - constexpr cudf::size_type leader_lane{0}; - cudf::size_type block_valid_count = - cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); - - if (threadIdx.x == 0) { // one thread computes and adds to null count - cuda::atomic_ref ref{*output_null_count}; - ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); - } -} - -template -struct DeviceType { - using type = T; -}; - -template -struct DeviceType()>> { - using type = typename T::rep; -}; - -template -struct DeviceType()>> { - using type = typename cudf::device_storage_type_t; -}; - -// Dispatch functor which performs the scatter for fixed column types and gather for other -template -struct scatter_gather_functor { - template ()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const* block_offsets, - Filter filter, - cudf::size_type per_thread, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto output_column = - cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); - auto output = output_column->mutable_view(); - - bool has_valid = input.nullable(); - - using Type = typename DeviceType::type; - - auto scatter = (has_valid) ? scatter_kernel - : scatter_kernel; - - cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - - cudf::detail::device_scalar null_count{0, stream}; - if (output.nullable()) { - // Have to initialize the output mask to all zeros because we may update - // it with atomicOr(). - CUDF_CUDA_TRY(cudaMemsetAsync(static_cast(output.null_mask()), - 0, - cudf::bitmask_allocation_size_bytes(output.size()), - stream.value())); - } - - auto output_device_view = cudf::mutable_column_device_view::create(output, stream); - auto input_device_view = cudf::column_device_view::create(input, stream); - scatter<<>>(*output_device_view, - null_count.data(), - *input_device_view, - block_offsets, - input.size(), - per_thread, - filter); - - if (has_valid) { output_column->set_null_count(null_count.value(stream)); } - return output_column; - } - - template () and !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const*, - Filter filter, - cudf::size_type, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - rmm::device_uvector indices(output_size, stream); - - thrust::copy_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - indices.begin(), - filter); - - auto output_table = cudf::detail::gather(cudf::table_view{{input}}, - indices, - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - - // There will be only one column - return std::make_unique(std::move(output_table->get_column(0))); - } -}; - /** * @brief Filters `input` using a Filter function object * @@ -319,9 +44,11 @@ struct scatter_gather_functor { * false otherwise. * * @tparam Filter the filter functor type - * @param[in] input The table_view to filter - * @param[in] filter A function object that takes an index and returns a bool - * @return unique_ptr The table generated from filtered `input`. + * @param input The table_view to filter + * @param filter A function object that takes an index and returns a bool + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for allocating the returned memory + * @return The table generated from filtered `input` */ template std::unique_ptr
copy_if(table_view const& input, @@ -333,76 +60,22 @@ std::unique_ptr
copy_if(table_view const& input, if (0 == input.num_rows() || 0 == input.num_columns()) { return empty_like(input); } - constexpr int block_size = 256; - cudf::size_type per_thread = - elements_per_thread(compute_block_counts, input.num_rows(), block_size); - cudf::detail::grid_1d grid{input.num_rows(), block_size, per_thread}; - - // temp storage for block counts and offsets - rmm::device_uvector block_counts(grid.num_blocks, stream); - rmm::device_uvector block_offsets(grid.num_blocks + 1, stream); - - // 1. Find the count of elements in each block that "pass" the mask - compute_block_counts<<>>( - block_counts.begin(), input.num_rows(), per_thread, filter); - - // initialize just the first element of block_offsets to 0 since the InclusiveSum below - // starts at the second element. - CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value())); - - // 2. Find the offset for each block's output using a scan of block counts - if (grid.num_blocks > 1) { - // Determine and allocate temporary device storage - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(nullptr, - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - // Run exclusive prefix sum - cub::DeviceScan::InclusiveSum(d_temp_storage.data(), - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - } - - // As it is InclusiveSum, last value in block_offsets will be output_size - // unless num_blocks == 1, in which case output_size is just block_counts[0] - cudf::size_type output_size{0}; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &output_size, - grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(), - sizeof(cudf::size_type), - cudaMemcpyDefault, - stream.value())); + auto indices = rmm::device_uvector(input.num_rows(), stream); + auto const begin = thrust::counting_iterator(0); + auto const end = begin + input.num_rows(); + auto const indices_end = + thrust::copy_if(rmm::exec_policy(stream), begin, end, indices.begin(), filter); - stream.synchronize(); + auto const output_size = static_cast(thrust::distance(indices.begin(), indices_end)); - if (output_size == input.num_rows()) { - return std::make_unique
(input, stream, mr); - } else if (output_size > 0) { - std::vector> out_columns(input.num_columns()); - std::transform(input.begin(), input.end(), out_columns.begin(), [&](auto col_view) { - return cudf::type_dispatcher(col_view.type(), - scatter_gather_functor{}, - col_view, - output_size, - block_offsets.begin(), - filter, - per_thread, - stream, - mr); - }); + // nothing selected + if (output_size == 0) { return empty_like(input); } + // everything selected + if (output_size == input.num_rows()) { return std::make_unique
(input, stream, mr); } - return std::make_unique
(std::move(out_columns)); - } else { - return empty_like(input); - } + auto const map = device_span(indices.data(), output_size); + return cudf::detail::gather( + input, map, out_of_bounds_policy::DONT_CHECK, negative_index_policy::NOT_ALLOWED, stream, mr); } } // namespace detail diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 5dc75b1a3fb..a7efb4e6e93 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -44,10 +44,11 @@ __launch_bounds__(block_size) CUDF_KERNEL mutable_column_device_view out, size_type* __restrict__ const valid_count) { - auto tidx = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - int const warp_id = tidx / cudf::detail::warp_size; - size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const warp_id = tidx / cudf::detail::warp_size; + auto const warps_per_grid = stride / cudf::detail::warp_size; // begin/end indices for the column data size_type const begin = 0; @@ -60,7 +61,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // lane id within the current warp constexpr size_type leader_lane{0}; - int const lane_id = threadIdx.x % cudf::detail::warp_size; + auto const lane_id = threadIdx.x % cudf::detail::warp_size; size_type warp_valid_count{0}; diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp index 16ca06c6561..090dc8b62b6 100644 --- a/cpp/include/cudf/detail/device_scalar.hpp +++ b/cpp/include/cudf/detail/device_scalar.hpp @@ -78,7 +78,7 @@ class device_scalar : public rmm::device_scalar { [[nodiscard]] T value(rmm::cuda_stream_view stream) const { cuda_memcpy(bounce_buffer, device_span{this->data(), 1}, stream); - return bounce_buffer[0]; + return std::move(bounce_buffer[0]); } void set_value_async(T const& value, rmm::cuda_stream_view stream) diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index 2acc10105cf..9a10163eb15 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,19 +36,24 @@ using cudf::experimental::row::lhs_index_type; using cudf::experimental::row::rhs_index_type; /** - * @brief An comparator adapter wrapping both self comparator and two table comparator + * @brief A custom comparator used for the build table insertion */ -template -struct comparator_adapter { - comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} - - __device__ constexpr auto operator()( +struct always_not_equal { + __device__ constexpr bool operator()( cuco::pair const&, cuco::pair const&) const noexcept { // All build table keys are distinct thus `false` no matter what return false; } +}; + +/** + * @brief An comparator adapter wrapping the two table comparator + */ +template +struct comparator_adapter { + comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} __device__ constexpr auto operator()( cuco::pair const& lhs, @@ -62,56 +67,14 @@ struct comparator_adapter { Equal _d_equal; }; -template -struct hasher_adapter { - hasher_adapter(Hasher const& d_hasher = {}) : _d_hasher{d_hasher} {} - - template - __device__ constexpr auto operator()(cuco::pair const& key) const noexcept - { - return _d_hasher(key.first); - } - - private: - Hasher _d_hasher; -}; - /** * @brief Distinct hash join that builds hash table in creation and probes results in subsequent * `*_join` member functions. * - * @tparam HasNested Flag indicating whether there are nested columns in build/probe table + * This class enables the distinct hash join scheme that builds hash table once, and probes as many + * times as needed (possibly in parallel). */ -template -struct distinct_hash_join { - private: - /// Device row equal type - using d_equal_type = cudf::experimental::row::equality::strong_index_comparator_adapter< - cudf::experimental::row::equality::device_row_comparator>; - using hasher = hasher_adapter>; - using probing_scheme_type = cuco::linear_probing<1, hasher>; - using cuco_storage_type = cuco::storage<1>; - - /// Hash table type - using hash_table_type = cuco::static_set, - cuco::extent, - cuda::thread_scope_device, - comparator_adapter, - probing_scheme_type, - cudf::detail::cuco_allocator, - cuco_storage_type>; - - bool _has_nulls; ///< true if nulls are present in either build table or probe table - cudf::null_equality _nulls_equal; ///< whether to consider nulls as equal - cudf::table_view _build; ///< input table to build the hash map - cudf::table_view _probe; ///< input table to probe the hash map - std::shared_ptr - _preprocessed_build; ///< input table preprocssed for row operators - std::shared_ptr - _preprocessed_probe; ///< input table preprocssed for row operators - hash_table_type _hash_table; ///< hash table built on `_build` - +class distinct_hash_join { public: distinct_hash_join() = delete; ~distinct_hash_join() = default; @@ -120,21 +83,28 @@ struct distinct_hash_join { distinct_hash_join& operator=(distinct_hash_join const&) = delete; distinct_hash_join& operator=(distinct_hash_join&&) = delete; + /** + * @brief Hasher adapter used by distinct hash join + */ + struct hasher { + template + __device__ constexpr hash_value_type operator()( + cuco::pair const& key) const noexcept + { + return key.first; + } + }; + /** * @brief Constructor that internally builds the hash table based on the given `build` table. * * @throw cudf::logic_error if the number of columns in `build` table is 0. * * @param build The build table, from which the hash table is built - * @param probe The probe table - * @param has_nulls Flag to indicate if any nulls exist in the `build` table or - * any `probe` table that will be used later for join. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. */ distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - bool has_nulls, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream); @@ -143,12 +113,36 @@ struct distinct_hash_join { */ std::pair>, std::unique_ptr>> - inner_join(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; /** * @copydoc cudf::distinct_hash_join::left_join */ std::unique_ptr> left_join( - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + private: + using probing_scheme_type = cuco::linear_probing<1, hasher>; + using cuco_storage_type = cuco::storage<1>; + + /// Hash table type + using hash_table_type = cuco::static_set, + cuco::extent, + cuda::thread_scope_device, + always_not_equal, + probing_scheme_type, + cudf::detail::cuco_allocator, + cuco_storage_type>; + + bool _has_nested_columns; ///< True if nested columns are present in build and probe tables + cudf::null_equality _nulls_equal; ///< Whether to consider nulls as equal + cudf::table_view _build; ///< Input table to build the hash map + std::shared_ptr + _preprocessed_build; ///< Input table preprocssed for row operators + hash_table_type _hash_table; ///< Hash table built on `_build` }; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh index 5ea0d06039f..1bfb40e5916 100644 --- a/cpp/include/cudf/detail/get_value.cuh +++ b/cpp/include/cudf/detail/get_value.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -48,11 +49,9 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre CUDF_EXPECTS(data_type(type_to_id()) == col_view.type(), "get_value data type mismatch"); CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(), "invalid element_index value"); - T result; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &result, col_view.data() + element_index, sizeof(T), cudaMemcpyDefault, stream.value())); - stream.synchronize(); - return result; + return cudf::detail::make_host_vector_sync( + device_span{col_view.data() + element_index, 1}, stream) + .front(); } } // namespace detail diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 61a8e9f7ec3..72cdc3d8067 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -74,9 +74,10 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type global_thread_id(thread_index_type thread_id, - thread_index_type block_id, - thread_index_type num_threads_per_block) + __device__ static constexpr thread_index_type global_thread_id( + thread_index_type thread_id, + thread_index_type block_id, + thread_index_type num_threads_per_block) { return thread_id + block_id * num_threads_per_block; } @@ -114,8 +115,8 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, - thread_index_type num_blocks_per_grid) + __device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, + thread_index_type num_blocks_per_grid) { return num_threads_per_block * num_blocks_per_grid; } diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 46f424e051b..923cd04479d 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,8 @@ #include #include +#include + #include namespace cudf { @@ -42,7 +44,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs) { - return std::min(lhs, rhs); + return cuda::std::min(lhs, rhs); } /** @@ -53,7 +55,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) { - return std::max(lhs, rhs); + return cuda::std::max(lhs, rhs); } } // namespace detail @@ -68,22 +70,26 @@ struct DeviceSum { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{typename T::duration{0}}; } template () && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{0}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support device operator identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support device operator identity"); +#endif return T{}; } }; @@ -105,7 +111,7 @@ struct DeviceCount { } template - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{}; } @@ -125,7 +131,7 @@ struct DeviceMin { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::max() // https://eel.is/c++draft/numeric.limits.general#6 @@ -139,9 +145,13 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMin identity"); +#endif return cuda::std::numeric_limits::max(); } @@ -153,7 +163,7 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::max_value()); } @@ -173,7 +183,7 @@ struct DeviceMax { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::min() // https://eel.is/c++draft/numeric.limits.general#6 @@ -187,9 +197,13 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMax identity"); +#endif return cuda::std::numeric_limits::lowest(); } @@ -200,7 +214,7 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::lowest_value()); } @@ -217,15 +231,19 @@ struct DeviceProduct { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{1}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceProduct identity"); +#endif return T{1, numeric::scale_type{0}}; } }; diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 8b709f2a8f8..2e3d71815c0 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,7 +86,7 @@ constexpr S round_down_safe(S number_to_round, S modulus) noexcept * `modulus` is positive and does not check for overflow. */ template -constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept +CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -134,16 +134,20 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend, } // namespace detail /** - * Divides the left-hand-side by the right-hand-side, rounding up + * @brief Divides the left-hand-side by the right-hand-side, rounding up * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. * - * @param dividend the number to divide - * @param divisor the number of by which to divide - * @return The least integer multiple of {@link divisor} which is greater than or equal to - * the non-integral division dividend/divisor. + * The result is undefined if `divisor == 0` or + * if `divisor == -1` and `dividend == min()`. + * + * Will not overflow, and may _or may not_ be slower than the intuitive + * approach of using `(dividend + divisor - 1) / divisor`. * - * @note will not overflow, and may _or may not_ be slower than the intuitive - * approach of using (dividend + divisor - 1) / divisor + * @tparam I Integer type for `dividend`, `divisor`, and the return type + * @param dividend The number to divide + * @param divisor The number by which to divide + * @return The least integer multiple of `divisor` which is greater than or equal to + * the non-integral division `dividend/divisor` */ template constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept @@ -183,7 +187,7 @@ constexpr bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. */ template -constexpr auto absolute_value(T value) -> T +CUDF_HOST_DEVICE constexpr auto absolute_value(T value) -> T { if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp deleted file mode 100644 index e7643eb44bd..00000000000 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp index fce08b4a5c4..9e68bafb09a 100644 --- a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -183,7 +184,7 @@ struct floating_converter { * @param integer_rep The bit-casted floating value to extract the exponent from * @return The stored base-2 exponent and significand, shifted for denormals */ - CUDF_HOST_DEVICE inline static std::pair get_significand_and_pow2( + CUDF_HOST_DEVICE inline static cuda::std::pair get_significand_and_pow2( IntegralType integer_rep) { // Extract the significand @@ -1008,7 +1009,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** @@ -1075,7 +1076,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index ea2f5d4b6ca..5edbb322231 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,7 +60,7 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 }; * @return `true` if the type is supported by `fixed_point` implementation */ template -constexpr inline auto is_supported_representation_type() +CUDF_HOST_DEVICE constexpr inline auto is_supported_representation_type() { return cuda::std::is_same_v || // cuda::std::is_same_v || // @@ -72,6 +72,24 @@ constexpr inline auto is_supported_representation_type() // Helper functions for `fixed_point` type namespace detail { +/** + * @brief Returns the smaller of the given scales + * + * @param a The left-hand side value to compare + * @param b The right-hand side value to compare + * @return The smaller of the given scales + */ +CUDF_HOST_DEVICE constexpr inline scale_type min(scale_type const& a, scale_type const& b) +{ + // TODO This is a temporary workaround because is not self-contained when + // built with NVRTC 11.8. Replace this with cuda::std::min once the underlying issue is resolved. +#ifdef __CUDA_ARCH__ + return scale_type{min(static_cast(a), static_cast(b))}; +#else + return std::min(a, b); +#endif +} + /** * @brief A function for integer exponentiation by squaring. * @@ -267,12 +285,12 @@ class fixed_point { * @return The `fixed_point` number in base 10 (aka human readable format) */ template >* = nullptr> - explicit constexpr operator U() const + CUDF_HOST_DEVICE explicit constexpr operator U() const { // Cast to the larger of the two types (of U and Rep) before converting to Rep because in // certain cases casting to U before shifting will result in integer overflow (i.e. if U = // int32_t, Rep = int64_t and _value > 2 billion) - auto const value = std::common_type_t(_value); + auto const value = cuda::std::common_type_t(_value); return static_cast(detail::shift(value, scale_type{-_scale})); } @@ -669,7 +687,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator+(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const sum = lhs.rescaled(scale)._value + rhs.rescaled(scale)._value; #if defined(__CUDACC_DEBUG__) @@ -687,7 +705,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator-(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const diff = lhs.rescaled(scale)._value - rhs.rescaled(scale)._value; #if defined(__CUDACC_DEBUG__) @@ -735,7 +753,7 @@ template CUDF_HOST_DEVICE inline bool operator==(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value == rhs.rescaled(scale)._value; } @@ -744,7 +762,7 @@ template CUDF_HOST_DEVICE inline bool operator!=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value != rhs.rescaled(scale)._value; } @@ -753,7 +771,7 @@ template CUDF_HOST_DEVICE inline bool operator<=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value <= rhs.rescaled(scale)._value; } @@ -762,7 +780,7 @@ template CUDF_HOST_DEVICE inline bool operator>=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value >= rhs.rescaled(scale)._value; } @@ -771,7 +789,7 @@ template CUDF_HOST_DEVICE inline bool operator<(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value < rhs.rescaled(scale)._value; } @@ -780,7 +798,7 @@ template CUDF_HOST_DEVICE inline bool operator>(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value; } @@ -789,7 +807,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator%(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const remainder = lhs.rescaled(scale)._value % rhs.rescaled(scale)._value; return fixed_point{scaled_integer{remainder, scale}}; } diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 307a52cd242..88034b4f804 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -166,6 +166,26 @@ std::unique_ptr sha512( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Computes the XXHash_32 hash value of each row in the given table + * + * This function computes the hash of each column using the `seed` for the first column + * and the resulting hash as a seed for the next column and so on. + * The result is a uint32 value for each row. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr xxhash_32( + table_view const& input, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Computes the XXHash_64 hash value of each row in the given table * diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh index 0ec41a20ef1..fd3455e761d 100644 --- a/cpp/include/cudf/hashing/detail/hash_functions.cuh +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -18,7 +18,8 @@ #include -#include +#include +#include namespace cudf::hashing::detail { @@ -29,7 +30,7 @@ template T __device__ inline normalize_nans(T const& key) { if constexpr (cudf::is_floating_point()) { - if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } + if (cuda::std::isnan(key)) { return cuda::std::numeric_limits::quiet_NaN(); } } return key; } diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index a978e54a1b9..f796ff4526e 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,6 +61,11 @@ std::unique_ptr sha512(table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +std::unique_ptr xxhash_32(table_view const& input, + uint64_t seed, + rmm::cuda_stream_view, + rmm::device_async_resource_ref mr); + std::unique_ptr xxhash_64(table_view const& input, uint64_t seed, rmm::cuda_stream_view, @@ -82,7 +87,7 @@ std::unique_ptr xxhash_64(table_view const& input, * @param rhs The second hash value * @return Combined hash value */ -constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) +CUDF_HOST_DEVICE constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) { return lhs ^ (rhs + 0x9e37'79b9 + (lhs << 6) + (lhs >> 2)); } diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh index e0c7ce840d7..69edf38e359 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh @@ -57,62 +57,71 @@ struct MurmurHash3_x86_32 { }; template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()(bool const& key) const +MurmurHash3_x86_32::result_type __device__ inline MurmurHash3_x86_32::operator()( + bool const& key) const { return this->compute(static_cast(key)); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()(float const& key) const +MurmurHash3_x86_32::result_type __device__ inline MurmurHash3_x86_32::operator()( + float const& key) const { return this->compute(normalize_nans_and_zeros(key)); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()(double const& key) const +MurmurHash3_x86_32::result_type __device__ inline MurmurHash3_x86_32::operator()( + double const& key) const { return this->compute(normalize_nans_and_zeros(key)); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - cudf::string_view const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + cudf::string_view const& key) const { return this->compute_bytes(reinterpret_cast(key.data()), key.size_bytes()); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - numeric::decimal32 const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal32 const& key) const { return this->compute(key.value()); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - numeric::decimal64 const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal64 const& key) const { return this->compute(key.value()); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - numeric::decimal128 const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal128 const& key) const { return this->compute(key.value()); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - cudf::list_view const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + cudf::list_view const& key) const { CUDF_UNREACHABLE("List column hashing is not supported"); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - cudf::struct_view const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + cudf::struct_view const& key) const { CUDF_UNREACHABLE("Direct hashing of struct_view is not supported"); } diff --git a/cpp/include/cudf/hashing/detail/xxhash_32.cuh b/cpp/include/cudf/hashing/detail/xxhash_32.cuh new file mode 100644 index 00000000000..bb6e7f18fbc --- /dev/null +++ b/cpp/include/cudf/hashing/detail/xxhash_32.cuh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::hashing::detail { + +template +struct XXHash_32 { + using result_type = std::uint32_t; + + CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} + + __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } + + __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes, + std::uint64_t size) const + { + return this->_impl.compute_hash(bytes, size); + } + + private: + template + __device__ constexpr result_type compute(T const& key) const + { + return this->compute_bytes(reinterpret_cast(&key), sizeof(T)); + } + + cuco::xxhash_32 _impl; +}; + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()(bool const& key) const +{ + return this->compute(static_cast(key)); +} + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()(float const& key) const +{ + return this->compute(normalize_nans_and_zeros(key)); +} + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()( + double const& key) const +{ + return this->compute(normalize_nans_and_zeros(key)); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(cudf::string_view const& key) const +{ + return this->compute_bytes(reinterpret_cast(key.data()), + key.size_bytes()); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(numeric::decimal32 const& key) const +{ + return this->compute(key.value()); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(numeric::decimal64 const& key) const +{ + return this->compute(key.value()); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(numeric::decimal128 const& key) const +{ + return this->compute(key.value()); +} + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()( + cudf::list_view const& key) const +{ + CUDF_UNREACHABLE("List column hashing is not supported"); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(cudf::struct_view const& key) const +{ + CUDF_UNREACHABLE("Direct hashing of struct_view is not supported"); +} + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp index 0d74a4158ad..4ad760d278f 100644 --- a/cpp/include/cudf/io/nvcomp_adapter.hpp +++ b/cpp/include/cudf/io/nvcomp_adapter.hpp @@ -22,7 +22,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::nvcomp { +namespace io::detail::nvcomp { enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4, GZIP }; @@ -88,5 +88,5 @@ inline bool operator==(feature_status_parameters const& lhs, feature_status_para [[nodiscard]] std::optional is_decompression_disabled( compression_type compression, feature_status_parameters params = feature_status_parameters()); -} // namespace io::nvcomp +} // namespace io::detail::nvcomp } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index bfe76d5690c..b561d0989e9 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -410,6 +410,7 @@ class parquet_reader_options_builder { * * @param val Boolean value whether to read matching projected and filter columns from mismatched * Parquet sources. + * * @return this for chaining. */ parquet_reader_options_builder& allow_mismatched_pq_schemas(bool val) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index afefd04d4fa..cc63565eee1 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,13 +34,6 @@ namespace CUDF_EXPORT cudf { -/** - * @brief Enum to indicate whether the distinct join table has nested columns or not - * - * @ingroup column_join - */ -enum class has_nested : bool { YES, NO }; - // forward declaration namespace hashing::detail { @@ -61,7 +54,6 @@ class hash_join; /** * @brief Forward declaration for our distinct hash join */ -template class distinct_hash_join; } // namespace detail @@ -469,20 +461,19 @@ class hash_join { rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; private: - const std::unique_ptr _impl; + std::unique_ptr _impl; }; /** * @brief Distinct hash join that builds hash table in creation and probes results in subsequent * `*_join` member functions * + * This class enables the distinct hash join scheme that builds hash table once, and probes as many + * times as needed (possibly in parallel). + * * @note Behavior is undefined if the build table contains duplicates. * @note All NaNs are considered as equal - * - * @tparam HasNested Flag indicating whether there are nested columns in build/probe table */ -// TODO: `HasNested` to be removed via dispatching -template class distinct_hash_join { public: distinct_hash_join() = delete; @@ -496,15 +487,10 @@ class distinct_hash_join { * @brief Constructs a distinct hash join object for subsequent probe calls * * @param build The build table that contains distinct elements - * @param probe The probe table, from which the keys are probed - * @param has_nulls Flag to indicate if there exists any nulls in the `build` table or - * any `probe` table that will be used later for join * @param compare_nulls Controls whether null join-key values should match or not * @param stream CUDA stream used for device memory operations and kernel launches */ distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - nullable_join has_nulls = nullable_join::YES, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = cudf::get_default_stream()); @@ -512,16 +498,18 @@ class distinct_hash_join { * @brief Returns the row indices that can be used to construct the result of performing * an inner join between two tables. @see cudf::inner_join(). * + * @param probe The probe table, from which the keys are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned indices' device memory. * - * @return A pair of columns [`build_indices`, `probe_indices`] that can be used to + * @return A pair of columns [`probe_indices`, `build_indices`] that can be used to * construct the result of performing an inner join between two tables * with `build` and `probe` as the join keys. */ [[nodiscard]] std::pair>, std::unique_ptr>> - inner_join(rmm::cuda_stream_view stream = cudf::get_default_stream(), + inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** @@ -532,19 +520,22 @@ class distinct_hash_join { * the row index of the matched row from the build table if there is a match. Otherwise, contains * `JoinNoneValue`. * + * @param probe The probe table, from which the keys are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. + * * @return A `build_indices` column that can be used to construct the result of * performing a left join between two tables with `build` and `probe` as the join * keys. */ [[nodiscard]] std::unique_ptr> left_join( + cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; private: - using impl_type = typename cudf::detail::distinct_hash_join; ///< Implementation type + using impl_type = cudf::detail::distinct_hash_join; ///< Implementation type std::unique_ptr _impl; ///< Distinct hash join implementation }; diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index 85349a421b1..84957ab9f1d 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -31,7 +31,7 @@ namespace strings::detail { * @param chr Any single byte from a valid UTF-8 character * @return true if this is not the first byte of the character */ -constexpr bool is_utf8_continuation_char(unsigned char chr) +CUDF_HOST_DEVICE constexpr bool is_utf8_continuation_char(unsigned char chr) { // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character. return (chr & 0xC0) == 0x80; @@ -43,7 +43,10 @@ constexpr bool is_utf8_continuation_char(unsigned char chr) * @param chr Any single byte from a valid UTF-8 character * @return true if this the first byte of the character */ -constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_continuation_char(chr); } +CUDF_HOST_DEVICE constexpr bool is_begin_utf8_char(unsigned char chr) +{ + return not is_utf8_continuation_char(chr); +} /** * @brief This will return true if the passed in byte could be the start of @@ -55,7 +58,7 @@ constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_contin * @param byte The byte to be tested * @return true if this can be the first byte of a character */ -constexpr bool is_valid_begin_utf8_char(uint8_t byte) +CUDF_HOST_DEVICE constexpr bool is_valid_begin_utf8_char(uint8_t byte) { // to be the first byte of a valid (up to 4 byte) UTF-8 char, byte must be one of: // 0b0vvvvvvv a 1 byte character @@ -72,7 +75,7 @@ constexpr bool is_valid_begin_utf8_char(uint8_t byte) * @param character Single character * @return Number of bytes */ -constexpr size_type bytes_in_char_utf8(char_utf8 character) +CUDF_HOST_DEVICE constexpr size_type bytes_in_char_utf8(char_utf8 character) { return 1 + static_cast((character & 0x0000'FF00u) > 0) + static_cast((character & 0x00FF'0000u) > 0) + @@ -89,7 +92,7 @@ constexpr size_type bytes_in_char_utf8(char_utf8 character) * @param byte Byte from an encoded character. * @return Number of bytes. */ -constexpr size_type bytes_in_utf8_byte(uint8_t byte) +CUDF_HOST_DEVICE constexpr size_type bytes_in_utf8_byte(uint8_t byte) { return 1 + static_cast((byte & 0xF0) == 0xF0) // 4-byte character prefix + static_cast((byte & 0xE0) == 0xE0) // 3-byte character prefix @@ -104,7 +107,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte) * @param[out] character Single char_utf8 value. * @return The number of bytes in the character */ -constexpr size_type to_char_utf8(char const* str, char_utf8& character) +CUDF_HOST_DEVICE constexpr size_type to_char_utf8(char const* str, char_utf8& character) { size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); @@ -131,7 +134,7 @@ constexpr size_type to_char_utf8(char const* str, char_utf8& character) * @param[out] str Output array. * @return The number of bytes in the character */ -constexpr inline size_type from_char_utf8(char_utf8 character, char* str) +CUDF_HOST_DEVICE constexpr inline size_type from_char_utf8(char_utf8 character, char* str) { size_type const chr_width = bytes_in_char_utf8(character); for (size_type idx = 0; idx < chr_width; ++idx) { @@ -148,7 +151,7 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str) * @param utf8_char Single UTF-8 character to convert. * @return Code-point for the UTF-8 character. */ -constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) +CUDF_HOST_DEVICE constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) { uint32_t unchr = 0; if (utf8_char < 0x0000'0080) // single-byte pass thru @@ -178,7 +181,7 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) * @param unchr Character code-point to convert. * @return Single UTF-8 character. */ -constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) +CUDF_HOST_DEVICE constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) { cudf::char_utf8 utf8 = 0; if (unchr < 0x0000'0080) // single byte utf8 diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 1ae4c3703b2..f0040e069d8 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -31,6 +31,8 @@ #include #endif +#include + #include // This file should only include device code logic. @@ -75,8 +77,8 @@ __device__ inline size_type characters_in_string(char const* str, size_type byte * @param pos Character position to count to * @return The number of bytes and the left over non-counted position value */ -__device__ inline std::pair bytes_to_character_position(string_view d_str, - size_type pos) +__device__ inline cuda::std::pair bytes_to_character_position( + string_view d_str, size_type pos) { size_type bytes = 0; auto ptr = d_str.data(); @@ -303,7 +305,7 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const __device__ inline size_type string_view::byte_offset(size_type pos) const { if (length() == size_bytes()) return pos; - return std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); + return cuda::std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); } __device__ inline int string_view::compare(string_view const& in) const diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 504c31057ae..33f3176d2c6 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -54,7 +54,7 @@ class string_view { * * @return The number of characters in this string */ - __device__ [[nodiscard]] inline size_type length() const; + [[nodiscard]] __device__ inline size_type length() const; /** * @brief Return a pointer to the internal device array * @@ -119,13 +119,13 @@ class string_view { * * @return new iterator pointing to the beginning of this string */ - __device__ [[nodiscard]] inline const_iterator begin() const; + [[nodiscard]] __device__ inline const_iterator begin() const; /** * @brief Return new iterator pointing past the end of this string * * @return new iterator pointing past the end of this string */ - __device__ [[nodiscard]] inline const_iterator end() const; + [[nodiscard]] __device__ inline const_iterator end() const; /** * @brief Return single UTF-8 character at the given character position @@ -140,7 +140,7 @@ class string_view { * @param pos Character position * @return Byte offset from data() for a given character position */ - __device__ [[nodiscard]] inline size_type byte_offset(size_type pos) const; + [[nodiscard]] __device__ inline size_type byte_offset(size_type pos) const; /** * @brief Comparing target string with this string. Each character is compared @@ -155,7 +155,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - __device__ [[nodiscard]] inline int compare(string_view const& str) const; + [[nodiscard]] __device__ inline int compare(string_view const& str) const; /** * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. @@ -225,7 +225,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if str is not found in this string. */ - __device__ [[nodiscard]] inline size_type find(string_view const& str, + [[nodiscard]] __device__ inline size_type find(string_view const& str, size_type pos = 0, size_type count = -1) const; /** @@ -253,7 +253,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type find(char_utf8 character, + [[nodiscard]] __device__ inline size_type find(char_utf8 character, size_type pos = 0, size_type count = -1) const; /** @@ -266,7 +266,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type rfind(string_view const& str, + [[nodiscard]] __device__ inline size_type rfind(string_view const& str, size_type pos = 0, size_type count = -1) const; /** @@ -294,7 +294,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type rfind(char_utf8 character, + [[nodiscard]] __device__ inline size_type rfind(char_utf8 character, size_type pos = 0, size_type count = -1) const; @@ -306,7 +306,7 @@ class string_view { * @param length Number of characters from start to include in the sub-string. * @return New instance pointing to a subset of the characters within this instance. */ - __device__ [[nodiscard]] inline string_view substr(size_type start, size_type length) const; + [[nodiscard]] __device__ inline string_view substr(size_type start, size_type length) const; /** * @brief Return minimum value associated with the string type @@ -386,7 +386,7 @@ class string_view { * @param bytepos Byte position from start of _data. * @return The character position for the specified byte. */ - __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const; + [[nodiscard]] __device__ inline size_type character_offset(size_type bytepos) const; /** * @brief Common internal implementation for string_view::find and string_view::rfind. diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 3f33c70c29a..8214ea6e83b 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -33,6 +33,8 @@ #include #include +#include +#include #include #include #include @@ -48,11 +50,8 @@ #include #include -#include #include -#include #include -#include namespace CUDF_EXPORT cudf { @@ -287,15 +286,16 @@ class device_row_comparator { * `null_order::BEFORE` for all columns. * @param comparator Physical element relational comparison functor. */ - device_row_comparator(Nullate check_nulls, - table_device_view lhs, - table_device_view rhs, - device_span l_dremel_device_views, - device_span r_dremel_device_views, - std::optional> depth = std::nullopt, - std::optional> column_order = std::nullopt, - std::optional> null_precedence = std::nullopt, - PhysicalElementComparator comparator = {}) noexcept + device_row_comparator( + Nullate check_nulls, + table_device_view lhs, + table_device_view rhs, + device_span l_dremel_device_views, + device_span r_dremel_device_views, + cuda::std::optional> depth = cuda::std::nullopt, + cuda::std::optional> column_order = cuda::std::nullopt, + cuda::std::optional> null_precedence = cuda::std::nullopt, + PhysicalElementComparator comparator = {}) noexcept : _lhs{lhs}, _rhs{rhs}, _l_dremel(l_dremel_device_views), @@ -331,9 +331,9 @@ class device_row_comparator { Nullate check_nulls, table_device_view lhs, table_device_view rhs, - std::optional> column_order = std::nullopt, - std::optional> null_precedence = std::nullopt, - PhysicalElementComparator comparator = {}) noexcept + cuda::std::optional> column_order = cuda::std::nullopt, + cuda::std::optional> null_precedence = cuda::std::nullopt, + PhysicalElementComparator comparator = {}) noexcept : _lhs{lhs}, _rhs{rhs}, _l_dremel{}, @@ -410,7 +410,7 @@ class device_row_comparator { return cuda::std::pair(_comparator(_lhs.element(lhs_element_index), _rhs.element(rhs_element_index)), - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); } /** @@ -455,7 +455,7 @@ class device_row_comparator { } if (lcol.num_child_columns() == 0) { - return cuda::std::pair(weak_ordering::EQUIVALENT, std::numeric_limits::max()); + return cuda::std::pair(weak_ordering::EQUIVALENT, cuda::std::numeric_limits::max()); } // Non-empty structs have been modified to only have 1 child when using this. @@ -607,7 +607,7 @@ class device_row_comparator { __device__ constexpr weak_ordering operator()(size_type const lhs_index, size_type const rhs_index) const noexcept { - int last_null_depth = std::numeric_limits::max(); + int last_null_depth = cuda::std::numeric_limits::max(); size_type list_column_index{-1}; for (size_type i = 0; i < _lhs.num_columns(); ++i) { if (_lhs.column(i).type().id() == type_id::LIST) { ++list_column_index; } @@ -626,9 +626,9 @@ class device_row_comparator { // here, otherwise the current code would be failing. auto const [l_dremel_i, r_dremel_i] = _lhs.column(i).type().id() == type_id::LIST - ? std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]), - optional_dremel_view(_r_dremel[list_column_index])) - : std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); + ? cuda::std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]), + optional_dremel_view(_r_dremel[list_column_index])) + : cuda::std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); auto element_comp = element_comparator{_check_nulls, _lhs.column(i), @@ -658,9 +658,9 @@ class device_row_comparator { device_span const _l_dremel; device_span const _r_dremel; Nullate const _check_nulls; - std::optional> const _depth; - std::optional> const _column_order; - std::optional> const _null_precedence; + cuda::std::optional> const _depth; + cuda::std::optional> const _column_order; + cuda::std::optional> const _null_precedence; PhysicalElementComparator const _comparator; }; // class device_row_comparator @@ -882,10 +882,10 @@ struct preprocessed_table { * @return Device array containing respective column orders. If no explicit column orders were * specified during the creation of this object then this will be `nullopt`. */ - [[nodiscard]] std::optional> column_order() const + [[nodiscard]] cuda::std::optional> column_order() const { - return _column_order.size() ? std::optional>(_column_order) - : std::nullopt; + return _column_order.size() ? cuda::std::optional>(_column_order) + : cuda::std::nullopt; } /** @@ -895,10 +895,11 @@ struct preprocessed_table { * @return Device array containing respective column null precedence. If no explicit column null * precedences were specified during the creation of this object then this will be `nullopt`. */ - [[nodiscard]] std::optional> null_precedence() const + [[nodiscard]] cuda::std::optional> null_precedence() const { - return _null_precedence.size() ? std::optional>(_null_precedence) - : std::nullopt; + return _null_precedence.size() + ? cuda::std::optional>(_null_precedence) + : cuda::std::nullopt; } /** @@ -909,9 +910,10 @@ struct preprocessed_table { * @return std::optional> Device array containing respective column depths. * If there are no nested columns in the table then this will be `nullopt`. */ - [[nodiscard]] std::optional> depths() const + [[nodiscard]] cuda::std::optional> depths() const { - return _depths.size() ? std::optional>(_depths) : std::nullopt; + return _depths.size() ? cuda::std::optional>(_depths) + : cuda::std::nullopt; } [[nodiscard]] device_span dremel_device_views() const @@ -940,8 +942,8 @@ struct preprocessed_table { rmm::device_uvector const _depths; // Dremel encoding of list columns used for the comparison algorithm - std::optional> _dremel_data; - std::optional> _dremel_device_views; + cuda::std::optional> _dremel_data; + cuda::std::optional> _dremel_device_views; // Intermediate columns generated from transforming nested children columns into // integers columns using `cudf::rank()`, need to be kept alive. @@ -1808,7 +1810,7 @@ class element_hasher { __device__ element_hasher( Nullate nulls, uint32_t seed = DEFAULT_HASH_SEED, - hash_value_type null_hash = std::numeric_limits::max()) noexcept + hash_value_type null_hash = cuda::std::numeric_limits::max()) noexcept : _check_nulls(nulls), _seed(seed), _null_hash(null_hash) { } @@ -1892,7 +1894,7 @@ class device_row_hasher { */ template