diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 13103e8f7..05f11c005 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,17 +5,17 @@
"args": {
"CUDA": "11.8",
"PYTHON_PACKAGE_MANAGER": "conda",
- "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04"
+ "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
- "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda"
+ "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
- "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
},
"overrideFeatureInstallOrder": [
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 74d62afcc..b4c507f86 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,24 +5,24 @@
"args": {
"CUDA": "11.8",
"PYTHON_PACKAGE_MANAGER": "pip",
- "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
+ "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
- "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip"
+ "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
- "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": {
+ "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
"version": "11.8",
"installcuBLAS": true,
"installcuSOLVER": true,
"installcuRAND": true,
"installcuSPARSE": true
},
- "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
},
"overrideFeatureInstallOrder": [
"ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
index d6902d3f9..4f8d628c2 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.5-conda/devcontainer.json
@@ -5,17 +5,17 @@
"args": {
"CUDA": "12.5",
"PYTHON_PACKAGE_MANAGER": "conda",
- "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04"
+ "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
- "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda"
+ "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
- "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
},
"overrideFeatureInstallOrder": [
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
index 3dcf52e83..8e6ba4de8 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.5-pip/devcontainer.json
@@ -5,24 +5,24 @@
"args": {
"CUDA": "12.5",
"PYTHON_PACKAGE_MANAGER": "pip",
- "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
+ "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
- "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip"
+ "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
- "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": {
+ "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
"version": "12.5",
"installcuBLAS": true,
"installcuSOLVER": true,
"installcuRAND": true,
"installcuSPARSE": true
},
- "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
},
"overrideFeatureInstallOrder": [
"ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index db20bdbc1..7ac02e365 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
jobs:
cpp-build:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
rust-build:
needs: cpp-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -50,7 +50,7 @@ jobs:
python-build:
needs: [cpp-build]
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
upload-conda:
needs: [cpp-build, python-build]
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -70,7 +70,7 @@ jobs:
if: github.ref_type == 'branch'
needs: python-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
with:
arch: "amd64"
branch: ${{ inputs.branch }}
@@ -82,7 +82,7 @@ jobs:
sha: ${{ inputs.sha }}
wheel-build-cuvs:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -92,7 +92,7 @@ jobs:
wheel-publish-cuvs:
needs: wheel-build-cuvs
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 07b10e85a..78648235f 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -12,6 +12,7 @@ concurrency:
jobs:
pr-builder:
needs:
+ - changed-files
- checks
- conda-cpp-build
- conda-cpp-tests
@@ -24,49 +25,87 @@ jobs:
- wheel-tests-cuvs
- devcontainer
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
+ if: always()
+ with:
+ needs: ${{ toJSON(needs) }}
+ changed-files:
+ secrets: inherit
+ uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12
+ with:
+ files_yaml: |
+ test_cpp:
+ - '**'
+ - '!.devcontainer/**'
+ - '!.pre-commit-config.yaml'
+ - '!README.md'
+ - '!docs/**'
+ - '!img/**'
+ - '!notebooks/**'
+ - '!python/**'
+ - '!rust/**'
+ - '!thirdparty/LICENSES/**'
+ test_notebooks:
+ - '**'
+ - '!.devcontainer/**'
+ - '!.pre-commit-config.yaml'
+ - '!README.md'
+ - '!rust/**'
+ - '!thirdparty/LICENSES/**'
+ test_python:
+ - '**'
+ - '!.devcontainer/**'
+ - '!.pre-commit-config.yaml'
+ - '!README.md'
+ - '!docs/**'
+ - '!img/**'
+ - '!notebooks/**'
+ - '!rust/**'
+ - '!thirdparty/LICENSES/**'
checks:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
with:
enable_check_generated_files: false
conda-cpp-build:
needs: checks
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
with:
build_type: pull-request
node_type: cpu16
conda-cpp-tests:
- needs: conda-cpp-build
+ needs: [conda-cpp-build, changed-files]
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
+ if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
with:
build_type: pull-request
conda-cpp-checks:
needs: conda-cpp-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
with:
build_type: pull-request
enable_check_symbols: true
- symbol_exclusions: (void (thrust::|cub::)|raft_cutlass)
+ symbol_exclusions: (void (thrust::|cub::))
conda-python-build:
needs: conda-cpp-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
with:
build_type: pull-request
conda-python-tests:
- needs: conda-python-build
+ needs: [conda-python-build, changed-files]
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+ if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
with:
build_type: pull-request
docs-build:
needs: conda-python-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
with:
build_type: pull-request
node_type: "gpu-v100-latest-1"
@@ -76,7 +115,7 @@ jobs:
rust-build:
needs: conda-cpp-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
with:
build_type: pull-request
node_type: "gpu-v100-latest-1"
@@ -86,20 +125,21 @@ jobs:
wheel-build-cuvs:
needs: checks
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
with:
build_type: pull-request
script: ci/build_wheel_cuvs.sh
wheel-tests-cuvs:
- needs: wheel-build-cuvs
+ needs: [wheel-build-cuvs, changed-files]
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+ if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
with:
build_type: pull-request
script: ci/test_wheel_cuvs.sh
devcontainer:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12
with:
arch: '["amd64"]'
cuda: '["12.5"]'
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 0821233a1..27dc99a11 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,17 +16,17 @@ on:
jobs:
conda-cpp-checks:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
with:
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
enable_check_symbols: true
- symbol_exclusions: (void (thrust::|cub::)|raft_cutlass)
+ symbol_exclusions: (void (thrust::|cub::))
conda-cpp-tests:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
with:
build_type: nightly
branch: ${{ inputs.branch }}
@@ -34,7 +34,7 @@ jobs:
sha: ${{ inputs.sha }}
conda-python-tests:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
with:
build_type: nightly
branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
sha: ${{ inputs.sha }}
wheel-tests-cuvs:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
with:
build_type: nightly
branch: ${{ inputs.branch }}
diff --git a/.gitignore b/.gitignore
index 97eab287d..da6eb07f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -75,6 +75,7 @@ compile_commands.json
.clangd/
# serialized ann indexes
+brute_force_index
cagra_index
ivf_flat_index
ivf_pq_index
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 439b42959..5e53abd92 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -91,7 +91,10 @@ repos:
- id: codespell
additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"]
- exclude: (?x)^(^CHANGELOG.md$)
+ exclude: |
+ (?x)
+ ^CHANGELOG[.]md$|
+ ^cpp/cmake/patches/cutlass/build-export[.]patch$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
@@ -113,7 +116,7 @@ repos:
cpp/cmake/modules/FindAVX\.cmake|
- id: verify-alpha-spec
- repo: https://github.com/rapidsai/dependency-file-generator
- rev: v1.13.11
+ rev: v1.16.0
hooks:
- id: rapids-dependency-file-generator
args: ["--clean"]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ce4a14c3..ed9429d55 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,67 @@
+# cuvs 24.12.00 (11 Dec 2024)
+
+## 🚨 Breaking Changes
+
+- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala)
+- Use dashes in cuvs-bench package name. ([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice)
+
+## 🐛 Bug Fixes
+
+- Skip IVF-PQ packing test for lists with not enough data ([#512](https://github.com/rapidsai/cuvs/pull/512)) [@achirkin](https://github.com/achirkin)
+- [BUG] Fix CAGRA filter ([#489](https://github.com/rapidsai/cuvs/pull/489)) [@enp1s0](https://github.com/enp1s0)
+- Add `kIsSingleSource` to `PairwiseDistanceEpilogueElementwise` ([#485](https://github.com/rapidsai/cuvs/pull/485)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Fix include errors, header, and unsafe locks in iface.hpp ([#467](https://github.com/rapidsai/cuvs/pull/467)) [@achirkin](https://github.com/achirkin)
+- Fix an OOB error in device-side cuvs::neighbors::refine and CAGRA kern_prune ([#460](https://github.com/rapidsai/cuvs/pull/460)) [@achirkin](https://github.com/achirkin)
+- Put a ceiling on cuda-python ([#445](https://github.com/rapidsai/cuvs/pull/445)) [@bdice](https://github.com/bdice)
+- Enable NVTX in cuvs-cagra-search component ([#439](https://github.com/rapidsai/cuvs/pull/439)) [@achirkin](https://github.com/achirkin)
+- BUG: CAGRA multi-cta illegal access with bad queries ([#438](https://github.com/rapidsai/cuvs/pull/438)) [@achirkin](https://github.com/achirkin)
+- Fix index overflow in edge cases of CAGRA graph optimize ([#435](https://github.com/rapidsai/cuvs/pull/435)) [@achirkin](https://github.com/achirkin)
+- Fix correct call to brute force in generate groundtruth of cuvs-bench ([#427](https://github.com/rapidsai/cuvs/pull/427)) [@dantegd](https://github.com/dantegd)
+- Use Python for sccache hit rate computation. ([#420](https://github.com/rapidsai/cuvs/pull/420)) [@bdice](https://github.com/bdice)
+- Add `click` package to `cuvs-bench` conda recipe ([#408](https://github.com/rapidsai/cuvs/pull/408)) [@divyegala](https://github.com/divyegala)
+- Fix NVTX annotations ([#400](https://github.com/rapidsai/cuvs/pull/400)) [@achirkin](https://github.com/achirkin)
+
+## 📖 Documentation
+
+- [Doc] Fix CAGRA search sample code ([#484](https://github.com/rapidsai/cuvs/pull/484)) [@enp1s0](https://github.com/enp1s0)
+- Fix broken link in README.md references ([#473](https://github.com/rapidsai/cuvs/pull/473)) [@Azurethi](https://github.com/Azurethi)
+- Adding tech stack to docs ([#448](https://github.com/rapidsai/cuvs/pull/448)) [@cjnolet](https://github.com/cjnolet)
+- Fix Question Retrieval notebook ([#352](https://github.com/rapidsai/cuvs/pull/352)) [@lowener](https://github.com/lowener)
+
+## 🚀 New Features
+
+- Add C++ API scalar quantization ([#494](https://github.com/rapidsai/cuvs/pull/494)) [@mfoerste4](https://github.com/mfoerste4)
+- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala)
+- Add serialization API to brute-force ([#461](https://github.com/rapidsai/cuvs/pull/461)) [@lowener](https://github.com/lowener)
+- Add Question Retrieval notebook using Milvus ([#451](https://github.com/rapidsai/cuvs/pull/451)) [@lowener](https://github.com/lowener)
+- Migrate feature diff for NN Descent from RAFT to cuVS ([#421](https://github.com/rapidsai/cuvs/pull/421)) [@divyegala](https://github.com/divyegala)
+- Add --no-lap-sync cmd option to ann-bench ([#405](https://github.com/rapidsai/cuvs/pull/405)) [@achirkin](https://github.com/achirkin)
+- Add `InnerProduct` and `CosineExpanded` metric support in NN Descent ([#177](https://github.com/rapidsai/cuvs/pull/177)) [@divyegala](https://github.com/divyegala)
+
+## 🛠️ Improvements
+
+- Update cuvs to match raft's cutlass changes ([#516](https://github.com/rapidsai/cuvs/pull/516)) [@vyasr](https://github.com/vyasr)
+- add a README for wheels ([#504](https://github.com/rapidsai/cuvs/pull/504)) [@jameslamb](https://github.com/jameslamb)
+- Move check_input_array from pylibraft ([#474](https://github.com/rapidsai/cuvs/pull/474)) [@benfred](https://github.com/benfred)
+- use different wheel-size thresholds based on CUDA version ([#469](https://github.com/rapidsai/cuvs/pull/469)) [@jameslamb](https://github.com/jameslamb)
+- Modify cuvs-bench to be able to generate ground truth in CPU systems ([#466](https://github.com/rapidsai/cuvs/pull/466)) [@dantegd](https://github.com/dantegd)
+- enforce wheel size limits, README formatting in CI ([#464](https://github.com/rapidsai/cuvs/pull/464)) [@jameslamb](https://github.com/jameslamb)
+- Moving spectral embedding and kernel gramm APIs to cuVS ([#463](https://github.com/rapidsai/cuvs/pull/463)) [@cjnolet](https://github.com/cjnolet)
+- Migrate sparse knn and distances code from raft ([#457](https://github.com/rapidsai/cuvs/pull/457)) [@benfred](https://github.com/benfred)
+- Don't presume pointers location infers usability. ([#441](https://github.com/rapidsai/cuvs/pull/441)) [@robertmaynard](https://github.com/robertmaynard)
+- call `enable_testing` in root CMakeLists.txt ([#437](https://github.com/rapidsai/cuvs/pull/437)) [@robertmaynard](https://github.com/robertmaynard)
+- CAGRA tech debt: distance descriptor and workspace memory ([#436](https://github.com/rapidsai/cuvs/pull/436)) [@achirkin](https://github.com/achirkin)
+- Add ci run_ scripts needed for build infra ([#434](https://github.com/rapidsai/cuvs/pull/434)) [@robertmaynard](https://github.com/robertmaynard)
+- Use environment variables in cache hit rate computation. ([#422](https://github.com/rapidsai/cuvs/pull/422)) [@bdice](https://github.com/bdice)
+- Use dashes in cuvs-bench package name. ([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice)
+- We need to enable the c_api by default ([#416](https://github.com/rapidsai/cuvs/pull/416)) [@robertmaynard](https://github.com/robertmaynard)
+- print sccache stats in builds ([#413](https://github.com/rapidsai/cuvs/pull/413)) [@jameslamb](https://github.com/jameslamb)
+- make conda installs in CI stricter ([#406](https://github.com/rapidsai/cuvs/pull/406)) [@jameslamb](https://github.com/jameslamb)
+- Ivf c example ([#404](https://github.com/rapidsai/cuvs/pull/404)) [@abner-ma](https://github.com/abner-ma)
+- Prune workflows based on changed files ([#392](https://github.com/rapidsai/cuvs/pull/392)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- [WIP] Add pinned memory resource to C API ([#311](https://github.com/rapidsai/cuvs/pull/311)) [@ajit283](https://github.com/ajit283)
+- Dynamic Batching ([#261](https://github.com/rapidsai/cuvs/pull/261)) [@achirkin](https://github.com/achirkin)
+
# cuvs 24.10.00 (9 Oct 2024)
## 🐛 Bug Fixes
diff --git a/README.md b/README.md
index 213fde632..23759f598 100755
--- a/README.md
+++ b/README.md
@@ -35,6 +35,7 @@ Finally, faster vector search enables interactions between dense vectors and gra
Below are some common use-cases for vector search
+
- ### Semantic search
- Generative AI & Retrieval augmented generation (RAG)
- Recommender systems
@@ -68,6 +69,14 @@ There are several benefits to using cuVS and GPUs for vector search, including
In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a deslightful development experimence, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale.
+## cuVS Technology Stack
+
+cuVS is built on top of the RAPIDS RAFT library of high-performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU.
+
+![cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU](img/tech_stack.png "cuVS Technology Stack")
+
+
+
## Installing cuVS
cuVS comes with pre-built packages that can be installed through [conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html#managing-python) and [pip](https://pip.pypa.io/en/stable/). Different packages are available for the different languages supported by cuVS:
@@ -100,7 +109,7 @@ pip install cuvs-cu12 --extra-index-url=https://pypi.nvidia.com
If installing a version that has not yet been released, the `rapidsai` channel can be replaced with `rapidsai-nightly`:
```bash
-conda install -c conda-forge -c nvidia -c rapidsai-nightly cuvs=24.10
+conda install -c conda-forge -c nvidia -c rapidsai-nightly cuvs=24.12
```
cuVS also has `pip` wheel packages that can be installed. Please see the [Build and Install Guide](https://docs.rapids.ai/api/cuvs/nightly/build/) for more information on installing the available cuVS packages and building from source.
@@ -233,7 +242,7 @@ If you are interested in contributing to the cuVS library, please read our [Cont
For the interested reader, many of the accelerated implementations in cuVS are also based on research papers which can provide a lot more background. We also ask you to please cite the corresponding algorithms by referencing them in your own research.
- [CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search](https://arxiv.org/abs/2308.15136)
-- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062>)
+- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062)
- [Fast K-NN Graph Construction by GPU Based NN-Descent](https://dl.acm.org/doi/abs/10.1145/3459637.3482344?casa_token=O_nan1B1F5cAAAAA:QHWDEhh0wmd6UUTLY9_Gv6c3XI-5DXM9mXVaUXOYeStlpxTPmV3nKvABRfoivZAaQ3n8FWyrkWw>)
- [cuSLINK: Single-linkage Agglomerative Clustering on the GPU](https://arxiv.org/abs/2306.16354)
- [GPU Semiring Primitives for Sparse Neighborhood Methods](https://arxiv.org/abs/2104.06357)
diff --git a/VERSION b/VERSION
index 7c7ba0443..af28c42b5 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.10.00
+24.12.00
diff --git a/build.sh b/build.sh
index b787d3a41..bd5fa649b 100755
--- a/build.sh
+++ b/build.sh
@@ -76,8 +76,8 @@ BUILD_REPORT_METRICS=""
BUILD_REPORT_INCL_CACHE_STATS=OFF
BUILD_SHARED_LIBS=ON
-TEST_TARGETS="NEIGHBORS_ANN_CAGRA_TEST"
-ANN_BENCH_TARGETS="CUVS_ANN_BENCH_ALL"
+TEST_TARGETS=""
+ANN_BENCH_TARGETS=""
CACHE_ARGS=""
NVTX=ON
@@ -273,14 +273,6 @@ fi
if hasArg tests || (( ${NUMARGS} == 0 )); then
BUILD_TESTS=ON
CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}"
-
- # Force compile library when needed test targets are specified
- if [[ $CMAKE_TARGET == *"CAGRA_C_TEST"* || \
- $CMAKE_TARGET == *"INTEROP_TEST"* || \
- $CMAKE_TARGET == *"NEIGHBORS_ANN_CAGRA_TEST"* ]]; then
- echo "-- Enabling compiled lib for gtests"
- COMPILE_LIBRARY=ON
- fi
fi
if hasArg bench-ann || (( ${NUMARGS} == 0 )); then
@@ -410,14 +402,14 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has
if [[ ${CACHE_TOOL} == "sccache" && -x "$(command -v sccache)" ]]; then
COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }')
CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }')
- HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}")
+ HIT_RATE=$(COMPILE_REQUESTS="${COMPILE_REQUESTS}" CACHE_HITS="${CACHE_HITS}" python3 -c "import os; print(f'{int(os.getenv(\"CACHE_HITS\")) / int(os.getenv(\"COMPILE_REQUESTS\")) * 100:.2f}' if int(os.getenv(\"COMPILE_REQUESTS\")) else 'nan')")
MSG="${MSG}
cache hit rate ${HIT_RATE} %"
elif [[ ${CACHE_TOOL} == "ccache" && -x "$(command -v ccache)" ]]; then
CACHE_STATS_LINE=$(ccache -s | grep "Hits: \+ [0-9]\+ / [0-9]\+" | tail -n1)
if [[ ! -z "$CACHE_STATS_LINE" ]]; then
CACHE_HITS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $2 }')
COMPILE_REQUESTS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $4 }')
- HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}")
+ HIT_RATE=$(COMPILE_REQUESTS="${COMPILE_REQUESTS}" CACHE_HITS="${CACHE_HITS}" python3 -c "import os; print(f'{int(os.getenv(\"CACHE_HITS\")) / int(os.getenv(\"COMPILE_REQUESTS\")) * 100:.2f}' if int(os.getenv(\"COMPILE_REQUESTS\")) else 'nan')")
MSG="${MSG}
cache hit rate ${HIT_RATE} %"
fi
fi
@@ -447,7 +439,7 @@ if (( ${NUMARGS} == 0 )) || hasArg python; then
python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs
fi
-# Build and (optionally) install the cuvs_bench Python package
+# Build and (optionally) install the cuvs-bench Python package
if (( ${NUMARGS} == 0 )) || hasArg bench-ann; then
python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs_bench
fi
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index 7bc0be5a7..db4c496cc 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -15,6 +15,10 @@ rapids-print-env
rapids-logger "Begin cpp build"
+sccache --zero-stats
+
RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild conda/recipes/libcuvs
+sccache --show-adv-stats
+
rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 460cc3899..bce93c605 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -6,6 +6,9 @@ set -euo pipefail
rapids-logger "Create test conda environment"
. /opt/conda/etc/profile.d/conda.sh
+RAPIDS_VERSION="$(rapids-version)"
+export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+
rapids-dependency-file-generator \
--output conda \
--file-key docs \
@@ -28,11 +31,9 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
rapids-mamba-retry install \
--channel "${CPP_CHANNEL}" \
--channel "${PYTHON_CHANNEL}" \
- libcuvs cuvs
+ "libcuvs=${RAPIDS_VERSION}" \
+ "cuvs=${RAPIDS_VERSION}"
-export RAPIDS_VERSION="$(rapids-version)"
-export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
-export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR"
export RAPIDS_DOCS_DIR="$(mktemp -d)"
rapids-logger "Build CPP docs"
@@ -54,4 +55,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/cuvs/"html
mv _html/* "${RAPIDS_DOCS_DIR}/cuvs/html"
popd
-rapids-upload-docs
+RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 7b0c639af..3241a2c2b 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -24,6 +24,8 @@ version=$(rapids-generate-version)
export RAPIDS_PACKAGE_VERSION=${version}
echo "${version}" > VERSION
+sccache --zero-stats
+
# TODO: Remove `--no-test` flags once importing on a CPU
# node works correctly
rapids-conda-retry mambabuild \
@@ -31,14 +33,20 @@ rapids-conda-retry mambabuild \
--channel "${CPP_CHANNEL}" \
conda/recipes/cuvs
-# Build cuvs_bench for each cuda and python version
+sccache --show-adv-stats
+sccache --zero-stats
+
+# Build cuvs-bench for each cuda and python version
rapids-conda-retry mambabuild \
--no-test \
--channel "${CPP_CHANNEL}" \
--channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
- conda/recipes/cuvs_bench
+ conda/recipes/cuvs-bench
-# Build cuvs_bench_cpu only in CUDA 12 jobs since it only depends on python
+sccache --show-adv-stats
+sccache --zero-stats
+
+# Build cuvs-bench-cpu only in CUDA 12 jobs since it only depends on python
# version
RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then
@@ -46,7 +54,9 @@ if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then
--no-test \
--channel "${CPP_CHANNEL}" \
--channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
- conda/recipes/cuvs_bench_cpu
+ conda/recipes/cuvs-bench-cpu
+
+ sccache --show-adv-stats
fi
rapids-upload-conda-to-s3 python
diff --git a/ci/build_rust.sh b/ci/build_rust.sh
index 31d0de053..309501c32 100755
--- a/ci/build_rust.sh
+++ b/ci/build_rust.sh
@@ -6,6 +6,8 @@ set -euo pipefail
rapids-logger "Create test conda environment"
. /opt/conda/etc/profile.d/conda.sh
+RAPIDS_VERSION="$(rapids-version)"
+
rapids-dependency-file-generator \
--output conda \
--file-key rust \
@@ -32,7 +34,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
# installing libcuvs/libraft will speed up the rust build substantially
rapids-mamba-retry install \
--channel "${CPP_CHANNEL}" \
- libcuvs \
- libraft
+ "libcuvs=${RAPIDS_VERSION}" \
+ "libraft=${RAPIDS_VERSION}"
bash ./build.sh rust
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index d1030276f..4994374a8 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -32,10 +32,20 @@ case "${RAPIDS_CUDA_VERSION}" in
;;
esac
-# Hardcode the output dir
-python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+rapids-logger "Building '${package_name}' wheel"
+
+sccache --zero-stats
+
+python -m pip wheel \
+ -w dist \
+ -v \
+ --no-deps \
+ --disable-pip-version-check \
+ .
+
+sccache --show-adv-stats
mkdir -p final_dist
python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*
-RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
+RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
diff --git a/ci/build_wheel_cuvs.sh b/ci/build_wheel_cuvs.sh
index e03da9f19..444657cc0 100755
--- a/ci/build_wheel_cuvs.sh
+++ b/ci/build_wheel_cuvs.sh
@@ -3,6 +3,8 @@
set -euo pipefail
+package_dir="python/cuvs"
+
case "${RAPIDS_CUDA_VERSION}" in
12.*)
EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON"
@@ -15,4 +17,5 @@ esac
# Set up skbuild options. Enable sccache in skbuild config options
export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUVS_CPP=OFF${EXTRA_CMAKE_ARGS}"
-ci/build_wheel.sh cuvs python/cuvs
+ci/build_wheel.sh cuvs ${package_dir}
+ci/validate_wheel.sh ${package_dir} final_dist
diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh
new file mode 100755
index 000000000..6bf83961b
--- /dev/null
+++ b/ci/run_ctests.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+# Runs ctest from the installed bin/gtests/libcuvs/ directory.
+# Any arguments given to this script are forwarded to ctest.
+
+set -euo pipefail
+
+# Pick the install root: INSTALL_PREFIX if set, else CONDA_PREFIX, else /usr.
+install_root="${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}"
+cd "${install_root}/bin/gtests/libcuvs/"
+
+ctest --output-on-failure --no-tests=error "$@"
diff --git a/ci/run_cuvs_pytests.sh b/ci/run_cuvs_pytests.sh
new file mode 100755
index 000000000..4de8927b1
--- /dev/null
+++ b/ci/run_cuvs_pytests.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# Allow this script to be invoked from any working directory: resolve its real
+# location and move to python/cuvs/cuvs relative to it.
+script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
+cd "${script_dir}/../python/cuvs/cuvs"
+
+# Extra command-line arguments are forwarded to pytest ahead of the tests directory.
+pytest --cache-clear --verbose "$@" tests
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index 6dfc2cf71..134dc4421 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -5,6 +5,8 @@ set -euo pipefail
. /opt/conda/etc/profile.d/conda.sh
+RAPIDS_VERSION="$(rapids-version)"
+
rapids-logger "Generate C++ testing dependencies"
rapids-dependency-file-generator \
--output conda \
@@ -26,7 +28,8 @@ rapids-print-env
rapids-mamba-retry install \
--channel "${CPP_CHANNEL}" \
- libcuvs libcuvs-tests
+ "libcuvs=${RAPIDS_VERSION}" \
+ "libcuvs-tests=${RAPIDS_VERSION}"
rapids-logger "Check GPU usage"
nvidia-smi
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 93bc597cf..b9c394062 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -5,6 +5,8 @@ set -euo pipefail
. /opt/conda/etc/profile.d/conda.sh
+RAPIDS_VERSION="$(rapids-version)"
+
rapids-logger "Generate Python testing dependencies"
rapids-dependency-file-generator \
--output conda \
@@ -31,7 +33,8 @@ rapids-print-env
rapids-mamba-retry install \
--channel "${CPP_CHANNEL}" \
--channel "${PYTHON_CHANNEL}" \
- libcuvs cuvs
+ "libcuvs=${RAPIDS_VERSION}" \
+ "cuvs=${RAPIDS_VERSION}"
rapids-logger "Check GPU usage"
nvidia-smi
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
new file mode 100755
index 000000000..f2b235765
--- /dev/null
+++ b/ci/validate_wheel.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+# Validates built wheels with 'pydistcheck' and 'twine'.
+#
+# Usage: validate_wheel.sh <package_dir> <wheel_dir_relative_path>
+#   package_dir              directory the checks are run from
+#   wheel_dir_relative_path  directory holding the *.whl files, relative to package_dir
+#
+# Requires RAPIDS_CUDA_VERSION to be set in the environment.
+
+set -euo pipefail
+
+package_dir=$1
+wheel_dir_relative_path=$2
+
+RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
+
+# some packages are much larger on CUDA 11 than on CUDA 12
+if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then
+  PYDISTCHECK_ARGS=(
+    --max-allowed-size-compressed '1.4G'
+  )
+else
+  PYDISTCHECK_ARGS=(
+    --max-allowed-size-compressed '950M'
+  )
+fi
+
+cd "${package_dir}"
+
+# Expand the glob straight into an array so every wheel is passed as its own argument,
+# even when several wheels match or the path contains whitespace.
+wheels=("${wheel_dir_relative_path}"/*.whl)
+
+rapids-logger "validate packages with 'pydistcheck'"
+
+pydistcheck \
+  --inspect \
+  "${PYDISTCHECK_ARGS[@]}" \
+  "${wheels[@]}"
+
+rapids-logger "validate packages with 'twine'"
+
+twine check \
+  --strict \
+  "${wheels[@]}"
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 065851064..80bfb0c24 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -15,7 +15,7 @@ dependencies:
- cmake>=3.26.4,!=3.30.0
- cuda-nvtx=11.8
- cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.7.1,<12.0a0,<=11.8.3
- cuda-version=11.8
- cudatoolkit
- cupy>=12.0.0
@@ -35,7 +35,7 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
- make
- nccl>=2.19
- ninja
@@ -45,7 +45,7 @@ dependencies:
- openblas
- pre-commit
- pydata-sphinx-theme
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
- pytest-cov
- pytest==7.*
- rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index a25393050..07937726c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
- cmake>=3.26.4,!=3.30.0
- cuda-nvtx=11.8
- cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.7.1,<12.0a0,<=11.8.3
- cuda-version=11.8
- cudatoolkit
- cupy>=12.0.0
@@ -35,7 +35,7 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
- make
- nccl>=2.19
- ninja
@@ -45,7 +45,7 @@ dependencies:
- openblas
- pre-commit
- pydata-sphinx-theme
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
- pytest-cov
- pytest==7.*
- rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index bb4a96d48..b7fd6fcfa 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -17,7 +17,7 @@ dependencies:
- cuda-nvcc
- cuda-nvtx-dev
- cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.0,<13.0a0,<=12.6.0
- cuda-version=12.5
- cupy>=12.0.0
- cxx-compiler
@@ -32,7 +32,7 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
- make
- nccl>=2.19
- ninja
@@ -41,7 +41,7 @@ dependencies:
- openblas
- pre-commit
- pydata-sphinx-theme
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
- pytest-cov
- pytest==7.*
- rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index bd1b95ae8..83a457465 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -17,7 +17,7 @@ dependencies:
- cuda-nvcc
- cuda-nvtx-dev
- cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.0,<13.0a0,<=12.6.0
- cuda-version=12.5
- cupy>=12.0.0
- cxx-compiler
@@ -32,7 +32,7 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
- make
- nccl>=2.19
- ninja
@@ -41,7 +41,7 @@ dependencies:
- openblas
- pre-commit
- pydata-sphinx-theme
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
- pytest-cov
- pytest==7.*
- rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index 554ad41ab..59d471bda 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -15,16 +15,17 @@ dependencies:
- cmake>=3.26.4,!=3.30.0
- cuda-nvtx=11.8
- cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.7.1,<12.0a0,<=11.8.3
- cuda-version=11.8
- cudatoolkit
+- cupy>=12.0.0
+- cuvs==24.12.*,>=0.0.0a0
- cxx-compiler
- cython>=3.0.0
- dlpack>=0.8,<1.0
- gcc_linux-aarch64=11.*
- glog>=0.6.0
- h5py>=3.8.0
-- hnswlib=0.6.2
- libcublas-dev=11.11.3.6
- libcublas=11.11.3.6
- libcurand-dev=10.3.0.86
@@ -33,7 +34,8 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
-- librmm==24.10.*,>=0.0.0a0
+- libcuvs==24.12.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
- matplotlib
- nccl>=2.19
- ninja
@@ -41,7 +43,7 @@ dependencies:
- nvcc_linux-aarch64=11.8
- openblas
- pandas
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- setuptools
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index dc38f3565..31a416eb5 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -15,16 +15,17 @@ dependencies:
- cmake>=3.26.4,!=3.30.0
- cuda-nvtx=11.8
- cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.7.1,<12.0a0,<=11.8.3
- cuda-version=11.8
- cudatoolkit
+- cupy>=12.0.0
+- cuvs==24.12.*,>=0.0.0a0
- cxx-compiler
- cython>=3.0.0
- dlpack>=0.8,<1.0
- gcc_linux-64=11.*
- glog>=0.6.0
- h5py>=3.8.0
-- hnswlib=0.6.2
- libcublas-dev=11.11.3.6
- libcublas=11.11.3.6
- libcurand-dev=10.3.0.86
@@ -33,7 +34,8 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
-- librmm==24.10.*,>=0.0.0a0
+- libcuvs==24.12.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
- matplotlib
- nccl>=2.19
- ninja
@@ -41,7 +43,7 @@ dependencies:
- nvcc_linux-64=11.8
- openblas
- pandas
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- setuptools
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index aeb23a9ef..3efe9ebde 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -17,27 +17,29 @@ dependencies:
- cuda-nvcc
- cuda-nvtx-dev
- cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.0,<13.0a0,<=12.6.0
- cuda-version=12.5
+- cupy>=12.0.0
+- cuvs==24.12.*,>=0.0.0a0
- cxx-compiler
- cython>=3.0.0
- dlpack>=0.8,<1.0
- gcc_linux-aarch64=11.*
- glog>=0.6.0
- h5py>=3.8.0
-- hnswlib=0.6.2
- libcublas-dev
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
-- librmm==24.10.*,>=0.0.0a0
+- libcuvs==24.12.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
- matplotlib
- nccl>=2.19
- ninja
- nlohmann_json>=3.11.2
- openblas
- pandas
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- setuptools
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index 3a408cd64..7fbd77368 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -17,27 +17,29 @@ dependencies:
- cuda-nvcc
- cuda-nvtx-dev
- cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.0,<13.0a0,<=12.6.0
- cuda-version=12.5
+- cupy>=12.0.0
+- cuvs==24.12.*,>=0.0.0a0
- cxx-compiler
- cython>=3.0.0
- dlpack>=0.8,<1.0
- gcc_linux-64=11.*
- glog>=0.6.0
- h5py>=3.8.0
-- hnswlib=0.6.2
- libcublas-dev
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
-- librmm==24.10.*,>=0.0.0a0
+- libcuvs==24.12.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
- matplotlib
- nccl>=2.19
- ninja
- nlohmann_json>=3.11.2
- openblas
- pandas
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- setuptools
diff --git a/conda/recipes/cuvs_bench_cpu/build.sh b/conda/recipes/cuvs-bench-cpu/build.sh
similarity index 100%
rename from conda/recipes/cuvs_bench_cpu/build.sh
rename to conda/recipes/cuvs-bench-cpu/build.sh
diff --git a/conda/recipes/cuvs_bench_cpu/conda_build_config.yaml b/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml
similarity index 100%
rename from conda/recipes/cuvs_bench_cpu/conda_build_config.yaml
rename to conda/recipes/cuvs-bench-cpu/conda_build_config.yaml
diff --git a/conda/recipes/cuvs_bench_cpu/meta.yaml b/conda/recipes/cuvs-bench-cpu/meta.yaml
similarity index 96%
rename from conda/recipes/cuvs_bench_cpu/meta.yaml
rename to conda/recipes/cuvs-bench-cpu/meta.yaml
index 0ce5db744..016df56be 100644
--- a/conda/recipes/cuvs_bench_cpu/meta.yaml
+++ b/conda/recipes/cuvs-bench-cpu/meta.yaml
@@ -8,7 +8,7 @@
{% set date_string = environ['RAPIDS_DATE_STRING'] %}
package:
- name: cuvs_bench_cpu
+ name: cuvs-bench-cpu
version: {{ version }}
script: build.sh
@@ -55,9 +55,11 @@ requirements:
run:
- benchmark
+ - click
- glog {{ glog_version }}
- h5py {{ h5py_version }}
- matplotlib
+ - numpy >=1.23,<3.0a0
- pandas
- pyyaml
- python
diff --git a/conda/recipes/cuvs_bench/build.sh b/conda/recipes/cuvs-bench/build.sh
similarity index 100%
rename from conda/recipes/cuvs_bench/build.sh
rename to conda/recipes/cuvs-bench/build.sh
diff --git a/conda/recipes/cuvs_bench/conda_build_config.yaml b/conda/recipes/cuvs-bench/conda_build_config.yaml
similarity index 100%
rename from conda/recipes/cuvs_bench/conda_build_config.yaml
rename to conda/recipes/cuvs-bench/conda_build_config.yaml
diff --git a/conda/recipes/cuvs_bench/meta.yaml b/conda/recipes/cuvs-bench/meta.yaml
similarity index 97%
rename from conda/recipes/cuvs_bench/meta.yaml
rename to conda/recipes/cuvs-bench/meta.yaml
index 9ecbf82bb..0681a1038 100644
--- a/conda/recipes/cuvs_bench/meta.yaml
+++ b/conda/recipes/cuvs-bench/meta.yaml
@@ -10,7 +10,7 @@
{% set date_string = environ['RAPIDS_DATE_STRING'] %}
package:
- name: cuvs_bench
+ name: cuvs-bench
version: {{ version }}
script: build.sh
@@ -82,15 +82,17 @@ requirements:
run:
- benchmark
+ - click
- {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
{% if cuda_major == "11" %}
- cudatoolkit
{% else %}
- cuda-cudart
+ - cupy>=12.0.0
- libcublas
{% endif %}
- glog {{ glog_version }}
- - libcuvs {{ version }}
+ - cuvs {{ version }}
- h5py {{ h5py_version }}
- matplotlib
- pandas
diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml
index e7e2daf0c..560c95feb 100644
--- a/conda/recipes/cuvs/meta.yaml
+++ b/conda/recipes/cuvs/meta.yaml
@@ -26,6 +26,7 @@ build:
- {{ compiler('cuda') }}
- cuda-cudart-dev
{% endif %}
+ - cuda-python
requirements:
build:
@@ -42,10 +43,10 @@ requirements:
- {{ stdlib("c") }}
host:
{% if cuda_major == "11" %}
- - cuda-python >=11.7.1,<12.0a0
+ - cuda-python >=11.7.1,<12.0a0,<=11.8.3
- cudatoolkit
{% else %}
- - cuda-python >=12.0,<13.0a0
+ - cuda-python >=12.0,<13.0a0,<=12.6.0
- cuda-cudart-dev
{% endif %}
- cuda-version ={{ cuda_version }}
@@ -60,13 +61,14 @@ requirements:
- {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
{% if cuda_major == "11" %}
- cudatoolkit
+ - cuda-python >=11.7.1,<12.0a0,<=11.8.3
{% else %}
- cuda-cudart
+ - cuda-python >=12.0,<13.0a0,<=12.6.0
{% endif %}
- pylibraft {{ minor_version }}
- libcuvs {{ version }}
- python x.x
- - cuda-python
- numpy >=1.23,<3.0a0
tests:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3e98a247e..95fb7e63b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -53,8 +53,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
option(BUILD_SHARED_LIBS "Build cuvs shared libraries" ON)
option(BUILD_TESTS "Build cuvs unit-tests" ON)
-option(BUILD_C_LIBRARY "Build cuVS C API library" OFF)
-option(BUILD_C_TESTS "Build cuVS C API tests" OFF)
+option(BUILD_C_LIBRARY "Build cuVS C API library" ON)
option(BUILD_CUVS_BENCH "Build cuVS ann benchmarks" OFF)
option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON)
option(BUILD_MG_ALGOS "Build with multi-GPU support" ON)
@@ -72,21 +71,12 @@ option(DISABLE_OPENMP "Disable OpenMP" OFF)
option(CUVS_NVTX "Enable nvtx markers" OFF)
option(CUVS_RAFT_CLONE_ON_PIN "Explicitly clone RAFT branch when pinned to non-feature branch" ON)
-if((BUILD_TESTS OR BUILD_C_LIBRARY) AND NOT BUILD_CPU_ONLY)
-
-endif()
-
if(BUILD_CPU_ONLY)
set(BUILD_SHARED_LIBS OFF)
set(BUILD_TESTS OFF)
set(BUILD_C_LIBRARY OFF)
-endif()
-
-if(NOT BUILD_C_LIBRARY)
- set(BUILD_C_TESTS OFF)
-endif()
-
-if(NOT BUILD_SHARED_LIBS)
+ set(BUILD_CAGRA_HNSWLIB OFF)
+elseif(NOT BUILD_SHARED_LIBS)
set(BUILD_TESTS OFF)
set(BUILD_C_LIBRARY OFF)
set(BUILD_CAGRA_HNSWLIB OFF)
@@ -334,6 +324,9 @@ if(BUILD_SHARED_LIBS)
src/cluster/kmeans_transform_float.cu
src/cluster/single_linkage_float.cu
src/core/bitset.cu
+ src/distance/detail/kernels/gram_matrix.cu
+ src/distance/detail/kernels/kernel_factory.cu
+ src/distance/detail/kernels/kernel_matrices.cu
src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
src/distance/detail/pairwise_matrix/dispatch_canberra_half_float_float_int.cu
src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
@@ -379,7 +372,10 @@ if(BUILD_SHARED_LIBS)
src/distance/detail/fused_distance_nn.cu
src/distance/distance.cu
src/distance/pairwise_distance.cu
+ src/distance/sparse_distance.cu
+ src/embed/spectral.cu
src/neighbors/brute_force.cu
+ src/neighbors/brute_force_serialize.cu
src/neighbors/cagra_build_float.cu
src/neighbors/cagra_build_half.cu
src/neighbors/cagra_build_int8.cu
@@ -405,6 +401,7 @@ if(BUILD_SHARED_LIBS)
src/neighbors/iface/iface_pq_uint8_t_int64_t.cu
src/neighbors/detail/cagra/cagra_build.cpp
src/neighbors/detail/cagra/topk_for_cagra/topk.cu
+ src/neighbors/dynamic_batching.cu
$<$:src/neighbors/hnsw.cpp>
src/neighbors/ivf_flat_index.cpp
src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu
@@ -446,6 +443,7 @@ if(BUILD_SHARED_LIBS)
src/neighbors/nn_descent.cu
src/neighbors/nn_descent_float.cu
src/neighbors/nn_descent_half.cu
+ src/neighbors/nn_descent_index.cpp
src/neighbors/nn_descent_int8.cu
src/neighbors/nn_descent_uint8.cu
src/neighbors/reachability.cu
@@ -458,12 +456,14 @@ if(BUILD_SHARED_LIBS)
src/neighbors/refine/detail/refine_host_int8_t_float.cpp
src/neighbors/refine/detail/refine_host_uint8_t_float.cpp
src/neighbors/sample_filter.cu
+ src/neighbors/sparse_brute_force.cu
src/neighbors/vamana_build_float.cu
src/neighbors/vamana_build_uint8.cu
src/neighbors/vamana_build_int8.cu
src/neighbors/vamana_serialize_float.cu
src/neighbors/vamana_serialize_uint8.cu
src/neighbors/vamana_serialize_int8.cu
+ src/preprocessing/quantize/scalar.cu
src/selection/select_k_float_int64_t.cu
src/selection/select_k_float_int32_t.cu
src/selection/select_k_float_uint32_t.cu
@@ -583,6 +583,7 @@ if(BUILD_SHARED_LIBS)
if(BUILD_CAGRA_HNSWLIB)
target_link_libraries(cuvs_objs PRIVATE hnswlib::hnswlib)
+ target_compile_definitions(cuvs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB)
target_compile_definitions(cuvs_objs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB)
endif()
@@ -613,6 +614,9 @@ SECTIONS
# This enables NVTX within the project with no option to disable it downstream.
target_link_libraries(cuvs PUBLIC CUDA::nvtx3)
target_compile_definitions(cuvs PUBLIC NVTX_ENABLED)
+
+ target_link_libraries(cuvs-cagra-search PUBLIC CUDA::nvtx3)
+ target_compile_definitions(cuvs-cagra-search PUBLIC NVTX_ENABLED)
else()
# Allow enable NVTX downstream if not set here. This creates a new option at build/install time,
# which is set by default to OFF, but can be enabled in the dependent project.
@@ -771,7 +775,8 @@ endif()
# ##################################################################################################
# * build test executable ----------------------------------------------------
-if(BUILD_TESTS OR BUILD_C_TESTS)
+if(BUILD_TESTS)
+ enable_testing()
add_subdirectory(internal)
add_subdirectory(test)
endif()
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index c36e70ace..c161a68bc 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -90,21 +90,6 @@ if(CUVS_ANN_BENCH_USE_FAISS)
include(cmake/thirdparty/get_faiss)
endif()
-# ##################################################################################################
-# * Enable NVTX if available
-
-# Note: ANN_BENCH wrappers have extra NVTX code not related to raft::nvtx.They track gbench
-# benchmark cases and iterations. This is to make limited NVTX available to all algos, not just
-# raft/cuVS.
-if(TARGET CUDA::nvtx3)
- set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES})
- get_target_property(CMAKE_REQUIRED_INCLUDES CUDA::nvtx3 INTERFACE_INCLUDE_DIRECTORIES)
- unset(NVTX3_HEADERS_FOUND CACHE)
- # Check the headers explicitly to make sure the cpu-only build succeeds
- CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND)
- set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG})
-endif()
-
# ##################################################################################################
# * Target function -------------------------------------------------------------
@@ -130,12 +115,9 @@ function(ConfigureAnnBench)
add_dependencies(${BENCH_NAME} ANN_BENCH)
else()
add_executable(${BENCH_NAME} ${ConfigureAnnBench_PATH})
- target_compile_definitions(
- ${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN
- $<$:ANN_BENCH_NVTX3_HEADERS_FOUND>
- )
+    # Define ANN_BENCH_BUILD_MAIN for the standalone benchmark executable.
+    target_compile_definitions(${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN)
target_link_libraries(
- ${BENCH_NAME} PRIVATE benchmark::benchmark $<$:CUDA::nvtx3>
+ ${BENCH_NAME} PRIVATE benchmark::benchmark $<$:CUDA::nvtx3>
)
endif()
@@ -243,9 +225,7 @@ if(CUVS_ANN_BENCH_USE_CUVS_CAGRA)
endif()
if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
- ConfigureAnnBench(
- NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs hnswlib::hnswlib
- )
+ ConfigureAnnBench(NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs)
endif()
if(CUVS_ANN_BENCH_USE_CUVS_MG)
@@ -318,7 +298,7 @@ if(CUVS_ANN_BENCH_SINGLE_EXE)
target_link_libraries(
ANN_BENCH
PRIVATE raft::raft nlohmann_json::nlohmann_json benchmark::benchmark dl fmt::fmt-header-only
- spdlog::spdlog_header_only $<$:CUDA::nvtx3>
+ spdlog::spdlog_header_only $<$:CUDA::nvtx3>
)
set_target_properties(
ANN_BENCH
@@ -336,7 +316,6 @@ if(CUVS_ANN_BENCH_SINGLE_EXE)
ANN_BENCH
PRIVATE
$<$:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}">
- $<$:ANN_BENCH_NVTX3_HEADERS_FOUND>
)
target_link_options(ANN_BENCH PRIVATE -export-dynamic)
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index db3e533e0..06e1e27af 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -119,7 +119,8 @@ template
void bench_build(::benchmark::State& state,
std::shared_ptr> dataset,
configuration::index index,
- bool force_overwrite)
+ bool force_overwrite,
+ bool no_lap_sync)
{
// NB: these two thread-local vars can be used within algo wrappers
cuvs::bench::benchmark_thread_id = state.thread_index();
@@ -149,9 +150,22 @@ void bench_build(::benchmark::State& state,
cuda_timer gpu_timer{algo};
{
nvtx_case nvtx{state.name()};
+ /* Note: GPU timing
+
+ The GPU time is measured between construction and destruction of `cuda_lap` objects (`gpu_all`
+ and `gpu_lap` variables) and added to the `gpu_timer` object.
+
+ We sync with the GPU (cudaEventSynchronize) either each iteration (lifetime of the `gpu_lap`
+ variable) or once per benchmark loop (lifetime of the `gpu_all` variable). The decision is
+ controlled by the `no_lap_sync` argument. In either case, we need at least one sync throughout
+ the benchmark loop to make sure the GPU has finished its work before we measure the total run
+ time.
+ */
+ [[maybe_unused]] auto gpu_all = gpu_timer.lap(no_lap_sync);
for (auto _ : state) {
[[maybe_unused]] auto ntx_lap = nvtx.lap();
- [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+ [[maybe_unused]] auto gpu_lap = gpu_timer.lap(!no_lap_sync);
try {
algo->build(base_set, index_size);
} catch (const std::exception& e) {
@@ -173,7 +187,8 @@ template
void bench_search(::benchmark::State& state,
configuration::index index,
std::size_t search_param_ix,
- std::shared_ptr> dataset)
+ std::shared_ptr> dataset,
+ bool no_lap_sync)
{
// NB: these two thread-local vars can be used within algo wrappers
cuvs::bench::benchmark_thread_id = state.thread_index();
@@ -300,25 +315,29 @@ void bench_search(::benchmark::State& state,
// Initialize with algo, so that the timer.lap() object can sync with algo::get_sync_stream()
cuda_timer gpu_timer{a};
auto start = std::chrono::high_resolution_clock::now();
- for (auto _ : state) {
- [[maybe_unused]] auto ntx_lap = nvtx.lap();
- [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
- try {
- a->search(query_set + batch_offset * dataset->dim(),
- n_queries,
- k,
- neighbors_ptr + out_offset * k,
- distances_ptr + out_offset * k);
- } catch (const std::exception& e) {
- state.SkipWithError("Benchmark loop: " + std::string(e.what()));
- break;
- }
+ {
+ /* See the note above: GPU timing */
+ [[maybe_unused]] auto gpu_all = gpu_timer.lap(no_lap_sync);
+ for (auto _ : state) {
+ [[maybe_unused]] auto ntx_lap = nvtx.lap();
+ [[maybe_unused]] auto gpu_lap = gpu_timer.lap(!no_lap_sync);
+ try {
+ a->search(query_set + batch_offset * dataset->dim(),
+ n_queries,
+ k,
+ neighbors_ptr + out_offset * k,
+ distances_ptr + out_offset * k);
+ } catch (const std::exception& e) {
+ state.SkipWithError("Benchmark loop: " + std::string(e.what()));
+ break;
+ }
- // advance to the next batch
- batch_offset = (batch_offset + queries_stride) % query_set_size;
- out_offset = (out_offset + n_queries) % query_set_size;
+ // advance to the next batch
+ batch_offset = (batch_offset + queries_stride) % query_set_size;
+ out_offset = (out_offset + n_queries) % query_set_size;
- queries_processed += n_queries;
+ queries_processed += n_queries;
+ }
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast>(end - start).count();
@@ -379,44 +398,51 @@ void bench_search(::benchmark::State& state,
inline void printf_usage()
{
::benchmark::PrintDefaultHelp();
- fprintf(stdout,
- " [--build|--search] \n"
- " [--force]\n"
- " [--data_prefix=]\n"
- " [--index_prefix=]\n"
- " [--override_kv=]\n"
- " [--mode=\n"
- " [--threads=min[:max]]\n"
- " .json\n"
- "\n"
- "Note the non-standard benchmark parameters:\n"
- " --build: build mode, will build index\n"
- " --search: search mode, will search using the built index\n"
- " one and only one of --build and --search should be specified\n"
- " --force: force overwriting existing index files\n"
- " --data_prefix=:"
- " prepend to dataset file paths specified in the .json (default = "
- "'data/').\n"
- " --index_prefix=:"
- " prepend to index file paths specified in the .json (default = "
- "'index/').\n"
- " --override_kv=:"
- " override a build/search key one or more times multiplying the number of configurations;"
- " you can use this parameter multiple times to get the Cartesian product of benchmark"
- " configs.\n"
- " --mode="
- " run the benchmarks in latency (accumulate times spent in each batch) or "
- " throughput (pipeline batches and measure end-to-end) mode\n"
- " --threads=min[:max] specify the number threads to use for throughput benchmark."
- " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified,"
- " then a single test is run with 'min' threads. By default min=1, max=.\n");
+ fprintf(
+ stdout,
+ " [--build|--search] \n"
+ " [--force]\n"
+ " [--data_prefix=]\n"
+ " [--index_prefix=]\n"
+ " [--override_kv=]\n"
+ " [--mode=\n"
+ " [--threads=min[:max]]\n"
+ " [--no-lap-sync]\n"
+ " .json\n"
+ "\n"
+ "Note the non-standard benchmark parameters:\n"
+ " --build: build mode, will build index\n"
+ " --search: search mode, will search using the built index\n"
+ " one and only one of --build and --search should be specified\n"
+ " --force: force overwriting existing index files\n"
+ " --data_prefix=:"
+ " prepend to dataset file paths specified in the .json (default = "
+ "'data/').\n"
+ " --index_prefix=:"
+ " prepend to index file paths specified in the .json (default = "
+ "'index/').\n"
+ " --override_kv=:"
+ " override a build/search key one or more times multiplying the number of configurations;"
+ " you can use this parameter multiple times to get the Cartesian product of benchmark"
+ " configs.\n"
+ " --mode="
+ " run the benchmarks in latency (accumulate times spent in each batch) or "
+ " throughput (pipeline batches and measure end-to-end) mode\n"
+ " --threads=min[:max] specify the number threads to use for throughput benchmark."
+ " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified,"
+ " then a single test is run with 'min' threads. By default min=1, max=.\n"
+ " --no-lap-sync disable CUDA event synchronization between benchmark iterations. If a GPU"
+ " algorithm has no sync with CPU, this can make the GPU processing significantly lag behind the"
+ " CPU scheduling. Then this also hides the scheduling latencies and thus improves the measured"
+ " throughput (QPS). Note there's a sync at the end of the benchmark loop in any case.\n");
}
template
void register_build(std::shared_ptr> dataset,
std::vector indices,
- bool force_overwrite)
+ bool force_overwrite,
+ bool no_lap_sync)
{
for (auto index : indices) {
auto suf = static_cast(index.build_param["override_suffix"]);
@@ -425,7 +451,7 @@ void register_build(std::shared_ptr> dataset,
std::replace(file_suf.begin(), file_suf.end(), '/', '-');
index.file += file_suf;
auto* b = ::benchmark::RegisterBenchmark(
- index.name + suf, bench_build, dataset, index, force_overwrite);
+ index.name + suf, bench_build, dataset, index, force_overwrite, no_lap_sync);
b->Unit(benchmark::kSecond);
b->MeasureProcessCPUTime();
b->UseRealTime();
@@ -436,14 +462,16 @@ template
void register_search(std::shared_ptr> dataset,
std::vector indices,
Mode metric_objective,
- const std::vector& threads)
+ const std::vector& threads,
+ bool no_lap_sync)
{
for (auto index : indices) {
for (std::size_t i = 0; i < index.search_params.size(); i++) {
auto suf = static_cast(index.search_params[i]["override_suffix"]);
index.search_params[i].erase("override_suffix");
- auto* b = ::benchmark::RegisterBenchmark(index.name + suf, bench_search, index, i, dataset)
+ auto* b = ::benchmark::RegisterBenchmark(
+ index.name + suf, bench_search, index, i, dataset, no_lap_sync)
->Unit(benchmark::kMillisecond)
/**
* The following are important for getting accuracy QPS measurements on both CPU
@@ -470,7 +498,8 @@ void dispatch_benchmark(std::string cmdline,
std::string index_prefix,
kv_series override_kv,
Mode metric_objective,
- const std::vector& threads)
+ const std::vector& threads,
+ bool no_lap_sync)
{
::benchmark::AddCustomContext("command_line", cmdline);
for (auto [key, value] : host_info()) {
@@ -514,7 +543,7 @@ void dispatch_benchmark(std::string cmdline,
more_indices.push_back(modified_index);
}
}
- register_build(dataset, more_indices, force_overwrite);
+ register_build(dataset, more_indices, force_overwrite, no_lap_sync);
} else if (search_mode) {
if (file_exists(query_file)) {
log_info("Using the query file '%s'", query_file.c_str());
@@ -543,7 +572,7 @@ void dispatch_benchmark(std::string cmdline,
index.search_params = apply_overrides(index.search_params, override_kv);
index.file = combine_path(index_prefix, index.file);
}
- register_search(dataset, indices, metric_objective, threads);
+ register_search(dataset, indices, metric_objective, threads, no_lap_sync);
}
}
@@ -571,6 +600,7 @@ inline auto run_main(int argc, char** argv) -> int
bool force_overwrite = false;
bool build_mode = false;
bool search_mode = false;
+ bool no_lap_sync = false;
std::string data_prefix = "data";
std::string index_prefix = "index";
std::string new_override_kv = "";
@@ -604,6 +634,7 @@ inline auto run_main(int argc, char** argv) -> int
if (parse_bool_flag(argv[i], "--force", force_overwrite) ||
parse_bool_flag(argv[i], "--build", build_mode) ||
parse_bool_flag(argv[i], "--search", search_mode) ||
+ parse_bool_flag(argv[i], "--no-lap-sync", no_lap_sync) ||
parse_string_flag(argv[i], "--data_prefix", data_prefix) ||
parse_string_flag(argv[i], "--index_prefix", index_prefix) ||
parse_string_flag(argv[i], "--mode", mode) ||
@@ -686,7 +717,8 @@ inline auto run_main(int argc, char** argv) -> int
index_prefix,
override_kv,
metric_objective,
- threads);
+ threads,
+ no_lap_sync);
} else if (dtype == "half") {
dispatch_benchmark(cmdline,
conf,
@@ -697,7 +729,8 @@ inline auto run_main(int argc, char** argv) -> int
index_prefix,
override_kv,
metric_objective,
- threads);
+ threads,
+ no_lap_sync);
} else if (dtype == "uint8") {
dispatch_benchmark(cmdline,
conf,
@@ -708,7 +741,8 @@ inline auto run_main(int argc, char** argv) -> int
index_prefix,
override_kv,
metric_objective,
- threads);
+ threads,
+ no_lap_sync);
} else if (dtype == "int8") {
dispatch_benchmark(cmdline,
conf,
@@ -719,7 +753,8 @@ inline auto run_main(int argc, char** argv) -> int
index_prefix,
override_kv,
metric_objective,
- threads);
+ threads,
+ no_lap_sync);
} else {
log_error("datatype '%s' is not supported", dtype.c_str());
return -1;
diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp
index c3db2bb4b..dbde74ccc 100644
--- a/cpp/bench/ann/src/common/util.hpp
+++ b/cpp/bench/ann/src/common/util.hpp
@@ -18,7 +18,8 @@
#include "ann_types.hpp"
#include "cuda_stub.hpp" // cuda-related utils
-#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND
+#if __has_include()
+#define ANN_BENCH_NVTX3_HEADERS_FOUND
#include
#endif
diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
index 57d5b1910..7617bfa66 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -56,6 +56,26 @@ extern template class cuvs::bench::cuvs_cagra;
#include "cuvs_mg_cagra_wrapper.h"
#endif
+template
+void parse_dynamic_batching_params(const nlohmann::json& conf, ParamT& param)
+{
+ if (!conf.value("dynamic_batching", false)) { return; }
+ param.dynamic_batching = true;
+ if (conf.contains("dynamic_batching_max_batch_size")) {
+ param.dynamic_batching_max_batch_size = conf.at("dynamic_batching_max_batch_size");
+ }
+ param.dynamic_batching_conservative_dispatch =
+ conf.value("dynamic_batching_conservative_dispatch", false);
+ if (conf.contains("dynamic_batching_dispatch_timeout_ms")) {
+ param.dynamic_batching_dispatch_timeout_ms = conf.at("dynamic_batching_dispatch_timeout_ms");
+ }
+ if (conf.contains("dynamic_batching_n_queues")) {
+ param.dynamic_batching_n_queues = conf.at("dynamic_batching_n_queues");
+ }
+ param.dynamic_batching_k =
+ uint32_t(uint32_t(conf.at("k")) * float(conf.value("refine_ratio", 1.0f)));
+}
+
#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) || defined(CUVS_ANN_BENCH_USE_CUVS_MG)
template
void parse_build_param(const nlohmann::json& conf,
@@ -138,6 +158,9 @@ void parse_search_param(const nlohmann::json& conf,
param.refine_ratio = conf.at("refine_ratio");
if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); }
}
+
+ // enable dynamic batching
+ parse_dynamic_batching_params(conf, param);
}
#endif
@@ -291,5 +314,8 @@ void parse_search_param(const nlohmann::json& conf,
}
// Same ratio as in IVF-PQ
param.refine_ratio = conf.value("refine_ratio", 1.0f);
+
+ // enable dynamic batching
+ parse_dynamic_batching_params(conf, param);
}
#endif
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
index 558ba01e0..e45a3bd5a 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
@@ -24,12 +24,35 @@
namespace cuvs::bench {
+template
+void parse_build_param(const nlohmann::json& conf,
+ typename cuvs::bench::cuvs_cagra_hnswlib::build_param& param)
+{
+ if (conf.contains("hierarchy")) {
+ if (conf.at("hierarchy") == "none") {
+ param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::NONE;
+ } else if (conf.at("hierarchy") == "cpu") {
+ param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::CPU;
+ } else {
+ THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get().c_str());
+ }
+ }
+ if (conf.contains("ef_construction")) {
+ param.hnsw_index_params.ef_construction = conf.at("ef_construction");
+ }
+ if (conf.contains("num_threads")) {
+ param.hnsw_index_params.num_threads = conf.at("num_threads");
+ }
+}
+
template
void parse_search_param(const nlohmann::json& conf,
typename cuvs::bench::cuvs_cagra_hnswlib::search_param& param)
{
- param.ef = conf.at("ef");
- if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+ param.hnsw_search_param.ef = conf.at("ef");
+ if (conf.contains("num_threads")) {
+ param.hnsw_search_param.num_threads = conf.at("num_threads");
+ }
}
template
@@ -43,9 +66,10 @@ auto create_algo(const std::string& algo_name,
if constexpr (std::is_same_v or std::is_same_v) {
if (algo_name == "raft_cagra_hnswlib" || algo_name == "cuvs_cagra_hnswlib") {
- typename cuvs::bench::cuvs_cagra_hnswlib::build_param param;
- parse_build_param(conf, param);
- a = std::make_unique>(metric, dim, param);
+ typename cuvs::bench::cuvs_cagra_hnswlib::build_param bparam;
+ ::parse_build_param(conf, bparam.cagra_build_param);
+ parse_build_param(conf, bparam);
+ a = std::make_unique>(metric, dim, bparam);
}
}
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
index 875fe0bba..e4169f6f8 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
@@ -15,8 +15,8 @@
*/
#pragma once
-#include "../hnswlib/hnswlib_wrapper.h"
#include "cuvs_cagra_wrapper.h"
+#include
#include
@@ -26,14 +26,20 @@ template
class cuvs_cagra_hnswlib : public algo, public algo_gpu {
public:
using search_param_base = typename algo::search_param;
- using build_param = typename cuvs_cagra::build_param;
- using search_param = typename hnsw_lib::search_param;
+
+ struct build_param {
+ typename cuvs_cagra::build_param cagra_build_param;
+ cuvs::neighbors::hnsw::index_params hnsw_index_params;
+ };
+
+ struct search_param : public search_param_base {
+ cuvs::neighbors::hnsw::search_params hnsw_search_param;
+ };
cuvs_cagra_hnswlib(Metric metric, int dim, const build_param& param, int concurrent_searches = 1)
: algo(metric, dim),
- cagra_build_{metric, dim, param, concurrent_searches},
- // hnsw_lib param values don't matter since we don't build with hnsw_lib
- hnswlib_search_{metric, dim, typename hnsw_lib::build_param{50, 100}}
+ build_param_{param},
+ cagra_build_{metric, dim, param.cagra_build_param, concurrent_searches}
{
}
@@ -69,40 +75,67 @@ class cuvs_cagra_hnswlib : public algo, public algo_gpu {
}
private:
+ raft::resources handle_{};
+ build_param build_param_;
+ search_param search_param_;
cuvs_cagra cagra_build_;
- hnsw_lib hnswlib_search_;
+ std::shared_ptr> hnsw_index_;
};
template
void cuvs_cagra_hnswlib::build(const T* dataset, size_t nrow)
{
cagra_build_.build(dataset, nrow);
+ auto* cagra_index = cagra_build_.get_index();
+ auto host_dataset_view = raft::make_host_matrix_view(dataset, nrow, this->dim_);
+ auto opt_dataset_view =
+ std::optional>(std::move(host_dataset_view));
+ hnsw_index_ = cuvs::neighbors::hnsw::from_cagra(
+ handle_, build_param_.hnsw_index_params, *cagra_index, opt_dataset_view);
}
template
void cuvs_cagra_hnswlib::set_search_param(const search_param_base& param_)
{
- hnswlib_search_.set_search_param(param_);
+ search_param_ = dynamic_cast(param_);
}
template
void cuvs_cagra_hnswlib::save(const std::string& file) const
{
- cagra_build_.save_to_hnswlib(file);
+ cuvs::neighbors::hnsw::serialize(handle_, file, *(hnsw_index_.get()));
}
template
void cuvs_cagra_hnswlib::load(const std::string& file)
{
- hnswlib_search_.load(file);
- hnswlib_search_.set_base_layer_only();
+ cuvs::neighbors::hnsw::index* idx = nullptr;
+ cuvs::neighbors::hnsw::deserialize(handle_,
+ build_param_.hnsw_index_params,
+ file,
+ this->dim_,
+ parse_metric_type(this->metric_),
+ &idx);
+ hnsw_index_ = std::shared_ptr>(idx);
}
template
void cuvs_cagra_hnswlib::search(
const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const
{
- hnswlib_search_.search(queries, batch_size, k, neighbors, distances);
+ // Only Latency mode is supported for now
+ auto queries_view =
+ raft::make_host_matrix_view(queries, batch_size, this->dim_);
+ auto neighbors_view = raft::make_host_matrix_view(
+ reinterpret_cast(neighbors), batch_size, k);
+ auto distances_view = raft::make_host_matrix_view(distances, batch_size, k);
+
+ cuvs::neighbors::hnsw::search(handle_,
+ search_param_.hnsw_search_param,
+ *(hnsw_index_.get()),
+ queries_view,
+ neighbors_view,
+ distances_view);
}
} // namespace cuvs::bench
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
index b2ba35eee..8c9cb2d4f 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
@@ -24,6 +24,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -63,6 +64,13 @@ class cuvs_cagra : public algo, public algo_gpu {
AllocatorType graph_mem = AllocatorType::kDevice;
AllocatorType dataset_mem = AllocatorType::kDevice;
[[nodiscard]] auto needs_dataset() const -> bool override { return true; }
+ /* Dynamic batching */
+ bool dynamic_batching = false;
+ int64_t dynamic_batching_k;
+ int64_t dynamic_batching_max_batch_size = 4;
+ double dynamic_batching_dispatch_timeout_ms = 0.01;
+ size_t dynamic_batching_n_queues = 8;
+ bool dynamic_batching_conservative_dispatch = false;
};
struct build_param {
@@ -154,6 +162,8 @@ class cuvs_cagra : public algo, public algo_gpu {
void save_to_hnswlib(const std::string& file) const;
std::unique_ptr> copy() override;
+ auto get_index() const -> const cuvs::neighbors::cagra::index* { return index_.get(); }
+
private:
// handle_ must go first to make sure it dies last and all memory allocated in pool
configured_raft_resources handle_{};
@@ -171,6 +181,12 @@ class cuvs_cagra : public algo, public algo_gpu {
std::shared_ptr> dataset_;
std::shared_ptr> input_dataset_v_;
+ std::shared_ptr> dynamic_batcher_;
+ cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{};
+ int64_t dynamic_batching_max_batch_size_;
+ size_t dynamic_batching_n_queues_;
+ bool dynamic_batching_conservative_dispatch_;
+
inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type)
{
switch (mem_type) {
@@ -214,26 +230,33 @@ inline auto allocator_to_string(AllocatorType mem_type) -> std::string
template
void cuvs_cagra::set_search_param(const search_param_base& param)
{
- auto sp = dynamic_cast(param);
- search_params_ = sp.p;
- refine_ratio_ = sp.refine_ratio;
+ auto sp = dynamic_cast(param);
+ bool needs_dynamic_batcher_update =
+ (dynamic_batching_max_batch_size_ != sp.dynamic_batching_max_batch_size) ||
+ (dynamic_batching_n_queues_ != sp.dynamic_batching_n_queues) ||
+ (dynamic_batching_conservative_dispatch_ != sp.dynamic_batching_conservative_dispatch);
+ dynamic_batching_max_batch_size_ = sp.dynamic_batching_max_batch_size;
+ dynamic_batching_n_queues_ = sp.dynamic_batching_n_queues;
+ dynamic_batching_conservative_dispatch_ = sp.dynamic_batching_conservative_dispatch;
+ search_params_ = sp.p;
+ refine_ratio_ = sp.refine_ratio;
if (sp.graph_mem != graph_mem_) {
// Move graph to correct memory space
graph_mem_ = sp.graph_mem;
RAFT_LOG_DEBUG("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str());
// We create a new graph and copy to it from existing graph
- auto mr = get_mr(graph_mem_);
- auto new_graph = raft::make_device_mdarray(
+ auto mr = get_mr(graph_mem_);
+ *graph_ = raft::make_device_mdarray(
handle_, mr, raft::make_extents(index_->graph().extent(0), index_->graph_degree()));
- raft::copy(new_graph.data_handle(),
+ raft::copy(graph_->data_handle(),
index_->graph().data_handle(),
index_->graph().size(),
raft::resource::get_cuda_stream(handle_));
- index_->update_graph(handle_, make_const_mdspan(new_graph.view()));
- // update_graph() only stores a view in the index. We need to keep the graph object alive.
- *graph_ = std::move(new_graph);
+ // NB: update_graph() only stores a view in the index. We need to keep the graph object alive.
+ index_->update_graph(handle_, make_const_mdspan(graph_->view()));
+ needs_dynamic_batcher_update = true;
}
if (sp.dataset_mem != dataset_mem_ || need_dataset_update_) {
@@ -254,7 +277,26 @@ void cuvs_cagra::set_search_param(const search_param_base& param)
dataset_->data_handle(), dataset_->extent(0), this->dim_, dataset_->extent(1));
index_->update_dataset(handle_, dataset_view);
- need_dataset_update_ = false;
+ need_dataset_update_ = false;
+ needs_dynamic_batcher_update = true;
+ }
+
+ // dynamic batching
+ if (sp.dynamic_batching) {
+ if (!dynamic_batcher_ || needs_dynamic_batcher_update) {
+ dynamic_batcher_ = std::make_shared>(
+ handle_,
+ cuvs::neighbors::dynamic_batching::index_params{{},
+ sp.dynamic_batching_k,
+ sp.dynamic_batching_max_batch_size,
+ sp.dynamic_batching_n_queues,
+ sp.dynamic_batching_conservative_dispatch},
+ *index_,
+ search_params_);
+ }
+ dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms;
+ } else {
+ if (dynamic_batcher_) { dynamic_batcher_.reset(); }
}
}
@@ -304,7 +346,7 @@ void cuvs_cagra::load(const std::string& file)
template
std::unique_ptr> cuvs_cagra::copy()
{
- return std::make_unique>(*this); // use copy constructor
+ return std::make_unique>(std::cref(*this)); // use copy constructor
}
template
@@ -328,8 +370,17 @@ void cuvs_cagra::search_base(const T* queries,
raft::make_device_matrix_view(neighbors_idx_t, batch_size, k);
auto distances_view = raft::make_device_matrix_view(distances, batch_size, k);
- cuvs::neighbors::cagra::search(
- handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+ if (dynamic_batcher_) {
+ cuvs::neighbors::dynamic_batching::search(handle_,
+ dynamic_batcher_sp_,
+ *dynamic_batcher_,
+ queries_view,
+ neighbors_view,
+ distances_view);
+ } else {
+ cuvs::neighbors::cagra::search(
+ handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+ }
if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) {
if (raft::get_device_for_address(neighbors) < 0 &&
@@ -365,11 +416,23 @@ void cuvs_cagra::search(
const raft::resources& res = handle_;
auto mem_type =
raft::get_device_for_address(neighbors) >= 0 ? MemoryType::kDevice : MemoryType::kHostPinned;
- auto& tmp_buf = get_tmp_buffer_from_global_pool(
- ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) +
- (kNeedsIoMapping ? sizeof(IdxT) : 0)) *
- batch_size * k0);
- auto* candidates_ptr = reinterpret_cast(tmp_buf.data(mem_type));
+
+ // If dynamic batching is used and there's no sync between benchmark laps, multiple sequential
+ // requests can group together. The data is copied asynchronously, and if the same intermediate
+ // buffer is used for multiple requests, they can override each other's data. Hence, we need to
+ // allocate as much space as required by the maximum number of sequential requests.
+ auto max_dyn_grouping = dynamic_batcher_ ? raft::div_rounding_up_safe(
+ dynamic_batching_max_batch_size_, batch_size) *
+ dynamic_batching_n_queues_
+ : 1;
+ auto tmp_buf_size = ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) +
+ (kNeedsIoMapping ? sizeof(IdxT) : 0)) *
+ batch_size * k0;
+ auto& tmp_buf = get_tmp_buffer_from_global_pool(tmp_buf_size * max_dyn_grouping);
+ thread_local static int64_t group_id = 0;
+ auto* candidates_ptr = reinterpret_cast(
+ reinterpret_cast(tmp_buf.data(mem_type)) + tmp_buf_size * group_id);
+ group_id = (group_id + 1) % max_dyn_grouping;
auto* candidate_dists_ptr =
reinterpret_cast(candidates_ptr + (disable_refinement ? 0 : batch_size * k0));
auto* neighbors_idx_t =
diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h
index 4c8a91f23..dac766669 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h
@@ -19,7 +19,9 @@
#include "cuvs_ann_bench_utils.h"
#include
+#include
#include
+
#include
#include
#include
@@ -46,6 +48,13 @@ class cuvs_ivf_pq : public algo, public algo_gpu {
cuvs::neighbors::ivf_pq::search_params pq_param;
float refine_ratio = 1.0f;
[[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; }
+ /* Dynamic batching */
+ bool dynamic_batching = false;
+ int64_t dynamic_batching_k;
+ int64_t dynamic_batching_max_batch_size = 128;
+ double dynamic_batching_dispatch_timeout_ms = 0.01;
+ size_t dynamic_batching_n_queues = 3;
+ bool dynamic_batching_conservative_dispatch = true;
};
using build_param = cuvs::neighbors::ivf_pq::index_params;
@@ -98,6 +107,9 @@ class cuvs_ivf_pq : public algo, public algo_gpu {
int dimension_;
float refine_ratio_ = 1.0;
raft::device_matrix_view dataset_;
+
+ std::shared_ptr> dynamic_batcher_;
+ cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{};
};
template
@@ -138,6 +150,21 @@ void cuvs_ivf_pq::set_search_param(const search_param_base& param)
search_params_ = sp.pq_param;
refine_ratio_ = sp.refine_ratio;
assert(search_params_.n_probes <= index_params_.n_lists);
+
+ if (sp.dynamic_batching) {
+ dynamic_batcher_ = std::make_shared>(
+ handle_,
+ cuvs::neighbors::dynamic_batching::index_params{{},
+ sp.dynamic_batching_k,
+ sp.dynamic_batching_max_batch_size,
+ sp.dynamic_batching_n_queues,
+ sp.dynamic_batching_conservative_dispatch},
+ *index_,
+ search_params_);
+ dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms;
+ } else {
+ dynamic_batcher_.reset();
+ }
}
template
@@ -168,8 +195,17 @@ void cuvs_ivf_pq::search_base(
raft::make_device_matrix_view(neighbors_idx_t, batch_size, k);
auto distances_view = raft::make_device_matrix_view(distances, batch_size, k);
- cuvs::neighbors::ivf_pq::search(
- handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+ if (dynamic_batcher_) {
+ cuvs::neighbors::dynamic_batching::search(handle_,
+ dynamic_batcher_sp_,
+ *dynamic_batcher_,
+ queries_view,
+ neighbors_view,
+ distances_view);
+ } else {
+ cuvs::neighbors::ivf_pq::search(
+ handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+ }
if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) {
raft::linalg::unaryOp(neighbors,
diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp
index 755c7c8d6..6e219d2a7 100644
--- a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp
@@ -33,7 +33,7 @@ void parse_build_param(const nlohmann::json& conf,
{
param.ef_construction = conf.at("efConstruction");
param.m = conf.at("M");
- if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+ if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); }
}
template
@@ -41,7 +41,7 @@ void parse_search_param(const nlohmann::json& conf,
typename cuvs::bench::hnsw_lib::search_param& param)
{
param.ef = conf.at("ef");
- if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+ if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); }
}
template class Algo>
diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index 74da25660..3e91d9995 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -22,8 +22,12 @@ endif()
# Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with
# clang)
if(CMAKE_COMPILER_IS_GNUCXX)
- list(APPEND CUVS_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations)
- list(APPEND CUVS_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations)
+ list(APPEND CUVS_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations
+ -Wno-reorder
+ )
+ list(APPEND CUVS_CUDA_FLAGS
+ -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations,-Wno-reorder
+ )
# set warnings as errors
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0)
diff --git a/cpp/cmake/patches/cutlass/build-export.patch b/cpp/cmake/patches/cutlass/build-export.patch
new file mode 100644
index 000000000..a6423e9c0
--- /dev/null
+++ b/cpp/cmake/patches/cutlass/build-export.patch
@@ -0,0 +1,27 @@
+From e0a9597946257a01ae8444200f836ee51d5597ba Mon Sep 17 00:00:00 2001
+From: Kyle Edwards
+Date: Wed, 20 Nov 2024 16:37:38 -0500
+Subject: [PATCH] Remove erroneous include directories
+
+These directories are left over from when CuTe was a separate
+CMake project. Remove them.
+---
+ CMakeLists.txt | 2 --
+ 1 file changed, 2 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 7419bdf5e..545384d82 100755
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -665,8 +665,6 @@ target_include_directories(
+ $
+ $
+ $
+- $
+- $
+ )
+
+ # Mark CTK headers as system to supress warnings from them
+--
+2.34.1
+
diff --git a/cpp/cmake/patches/cutlass_override.json b/cpp/cmake/patches/cutlass_override.json
new file mode 100644
index 000000000..7bf818987
--- /dev/null
+++ b/cpp/cmake/patches/cutlass_override.json
@@ -0,0 +1,16 @@
+{
+ "packages" : {
+ "cutlass" : {
+ "version": "3.5.1",
+ "git_url": "https://github.com/NVIDIA/cutlass.git",
+ "git_tag": "v${version}",
+ "patches" : [
+ {
+ "file" : "${current_json_dir}/cutlass/build-export.patch",
+ "issue" : "Fix build directory export",
+ "fixed_in" : ""
+ }
+ ]
+ }
+ }
+}
diff --git a/cpp/cmake/patches/hnswlib.diff b/cpp/cmake/patches/hnswlib.diff
index e7f89a8cc..f20c27d91 100644
--- a/cpp/cmake/patches/hnswlib.diff
+++ b/cpp/cmake/patches/hnswlib.diff
@@ -1,188 +1,159 @@
+diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h
+index bef0017..0ee7931 100644
--- a/hnswlib/hnswalg.h
+++ b/hnswlib/hnswalg.h
-@@ -3,6 +3,7 @@
- #include "visited_list_pool.h"
- #include "hnswlib.h"
- #include
-+#include
- #include
- #include
- #include
-@@ -16,6 +17,8 @@ namespace hnswlib {
- template
- class HierarchicalNSW : public AlgorithmInterface {
- public:
-+ bool base_layer_only{false};
-+ int num_seeds=32;
- static const tableint max_update_element_locks = 65536;
- HierarchicalNSW(SpaceInterface *s) {
- }
-@@ -56,7 +59,7 @@ namespace hnswlib {
- visited_list_pool_ = new VisitedListPool(1, max_elements);
-
- //initializations for special treatment of the first node
-- enterpoint_node_ = -1;
-+ enterpoint_node_ = std::numeric_limits::max();
- maxlevel_ = -1;
-
- linkLists_ = (char **) malloc(sizeof(void *) * max_elements_);
-@@ -527,7 +530,7 @@ namespace hnswlib {
- tableint *datal = (tableint *) (data + 1);
- for (int i = 0; i < size; i++) {
- tableint cand = datal[i];
-- if (cand < 0 || cand > max_elements_)
-+ if (cand > max_elements_)
- throw std::runtime_error("cand error");
- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
-
-@@ -1067,7 +1070,7 @@ namespace hnswlib {
- tableint *datal = (tableint *) (data + 1);
- for (int i = 0; i < size; i++) {
- tableint cand = datal[i];
-- if (cand < 0 || cand > max_elements_)
-+ if (cand > max_elements_)
- throw std::runtime_error("cand error");
- dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_);
- if (d < curdist) {
-@@ -1119,28 +1122,41 @@ namespace hnswlib {
- tableint currObj = enterpoint_node_;
- dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
-
-- for (int level = maxlevel_; level > 0; level--) {
-- bool changed = true;
-- while (changed) {
-- changed = false;
-- unsigned int *data;
-+ if (base_layer_only) {
-+ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale
-+ for (int i = 0; i < num_seeds; i++) {
-+ tableint obj = i * (max_elements_ / num_seeds);
-+ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_);
-+ if (dist < curdist) {
-+ curdist = dist;
-+ currObj = obj;
-+ }
+@@ -16,6 +16,9 @@ typedef unsigned int linklistsizeint;
+ template
+ class HierarchicalNSW : public AlgorithmInterface {
+ public:
++ bool base_layer_only = false;
++ int num_seeds = 32;
++ bool base_layer_init = true;
+ static const tableint MAX_LABEL_OPERATION_LOCKS = 65536;
+ static const unsigned char DELETE_MARK = 0x01;
+
+@@ -1098,7 +1101,7 @@ class HierarchicalNSW : public AlgorithmInterface {
+
+ std::unique_lock lock_el(link_list_locks_[cur_c]);
+ int curlevel = getRandomLevel(mult_);
+- if (level > 0)
++ if (level > -1)
+ curlevel = level;
+
+ element_levels_[cur_c] = curlevel;
+@@ -1116,6 +1119,9 @@ class HierarchicalNSW : public AlgorithmInterface {
+ memcpy(getExternalLabeLp(cur_c), &label, sizeof(labeltype));
+ memcpy(getDataByInternalId(cur_c), data_point, data_size_);
+
++ if (!base_layer_init && curlevel == 0)
++ return cur_c;
++
+ if (curlevel) {
+ linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1);
+ if (linkLists_[cur_c] == nullptr)
+@@ -1138,7 +1144,7 @@ class HierarchicalNSW : public AlgorithmInterface {
+ tableint *datal = (tableint *) (data + 1);
+ for (int i = 0; i < size; i++) {
+ tableint cand = datal[i];
+- if (cand < 0 || cand > max_elements_)
++ if (static_cast(cand) < 0 || cand > max_elements_)
+ throw std::runtime_error("cand error");
+ dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_);
+ if (d < curdist) {
+@@ -1188,28 +1194,41 @@ class HierarchicalNSW : public AlgorithmInterface {
+ tableint currObj = enterpoint_node_;
+ dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
+
+- for (int level = maxlevel_; level > 0; level--) {
+- bool changed = true;
+- while (changed) {
+- changed = false;
+- unsigned int *data;
++ if (base_layer_only) {
++ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale
++ for (int i = 0; i < num_seeds; i++) {
++ tableint obj = i * (max_elements_ / num_seeds);
++ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_);
++ if (dist < curdist) {
++ curdist = dist;
++ currObj = obj;
+ }
+ }
-+ else{
-+ for (int level = maxlevel_; level > 0; level--) {
-+ bool changed = true;
-+ while (changed) {
-+ changed = false;
-+ unsigned int *data;
-
-- data = (unsigned int *) get_linklist(currObj, level);
-- int size = getListCount(data);
-- metric_hops++;
-- metric_distance_computations+=size;
-+ data = (unsigned int *) get_linklist(currObj, level);
-+ int size = getListCount(data);
-+ metric_hops++;
-+ metric_distance_computations+=size;
-
-- tableint *datal = (tableint *) (data + 1);
-- for (int i = 0; i < size; i++) {
-- tableint cand = datal[i];
-- if (cand < 0 || cand > max_elements_)
-- throw std::runtime_error("cand error");
-- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
-+ tableint *datal = (tableint *) (data + 1);
-+ for (int i = 0; i < size; i++) {
-+ tableint cand = datal[i];
-+ if (cand > max_elements_)
-+ throw std::runtime_error("cand error");
-+ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
-
-- if (d < curdist) {
-- curdist = d;
-- currObj = cand;
-- changed = true;
-+ if (d < curdist) {
-+ curdist = d;
-+ currObj = cand;
-+ changed = true;
-+ }
- }
++ }
++ else {
++ for (int level = maxlevel_; level > 0; level--) {
++ bool changed = true;
++ while (changed) {
++ changed = false;
++ unsigned int *data;
+
+- data = (unsigned int *) get_linklist(currObj, level);
+- int size = getListCount(data);
+- metric_hops++;
+- metric_distance_computations+=size;
++ data = (unsigned int *) get_linklist(currObj, level);
++ int size = getListCount(data);
++ metric_hops++;
++ metric_distance_computations+=size;
++
++ tableint *datal = (tableint *) (data + 1);
++ for (int i = 0; i < size; i++) {
++ tableint cand = datal[i];
++ if (static_cast(cand) < 0 || cand > max_elements_)
++ throw std::runtime_error("cand error");
++ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
+
+- tableint *datal = (tableint *) (data + 1);
+- for (int i = 0; i < size; i++) {
+- tableint cand = datal[i];
+- if (cand < 0 || cand > max_elements_)
+- throw std::runtime_error("cand error");
+- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
+-
+- if (d < curdist) {
+- curdist = d;
+- currObj = cand;
+- changed = true;
++ if (d < curdist) {
++ curdist = d;
++ currObj = cand;
++ changed = true;
++ }
}
}
+ }
diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h
-index 4413537..c3240f3 100644
+index 834d19f..0c0af26 100644
--- a/hnswlib/space_l2.h
+++ b/hnswlib/space_l2.h
-@@ -252,13 +252,14 @@ namespace hnswlib {
- ~L2Space() {}
- };
-
-+ template
- static int
- L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) {
-
- size_t qty = *((size_t *) qty_ptr);
- int res = 0;
-- unsigned char *a = (unsigned char *) pVect1;
-- unsigned char *b = (unsigned char *) pVect2;
-+ T *a = (T *) pVect1;
-+ T *b = (T *) pVect2;
-
- qty = qty >> 2;
- for (size_t i = 0; i < qty; i++) {
-@@ -279,11 +280,12 @@ namespace hnswlib {
- return (res);
- }
-
-+ template
- static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) {
- size_t qty = *((size_t*)qty_ptr);
- int res = 0;
-- unsigned char* a = (unsigned char*)pVect1;
-- unsigned char* b = (unsigned char*)pVect2;
-+ T* a = (T*)pVect1;
-+ T* b = (T*)pVect2;
-
- for(size_t i = 0; i < qty; i++)
- {
-@@ -294,6 +296,7 @@ namespace hnswlib {
- return (res);
- }
-
-+ template
- class L2SpaceI : public SpaceInterface {
-
- DISTFUNC fstdistfunc_;
-@@ -302,10 +305,10 @@ namespace hnswlib {
- public:
- L2SpaceI(size_t dim) {
- if(dim % 4 == 0) {
-- fstdistfunc_ = L2SqrI4x;
-+ fstdistfunc_ = L2SqrI4x;
- }
- else {
-- fstdistfunc_ = L2SqrI;
-+ fstdistfunc_ = L2SqrI;
- }
- dim_ = dim;
- data_size_ = dim * sizeof(unsigned char);
-diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h
-index 5e1a4a5..4195ebd 100644
---- a/hnswlib/visited_list_pool.h
-+++ b/hnswlib/visited_list_pool.h
-@@ -3,6 +3,7 @@
- #include
- #include
- #include
-+#include
-
- namespace hnswlib {
- typedef unsigned short int vl_type;
-@@ -14,7 +15,7 @@ namespace hnswlib {
- unsigned int numelements;
-
- VisitedList(int numelements1) {
-- curV = -1;
-+ curV = std::numeric_limits::max();
- numelements = numelements1;
- mass = new vl_type[numelements];
+@@ -252,12 +252,13 @@ class L2Space : public SpaceInterface {
+ ~L2Space() {}
+ };
+
++template
+ static int
+ L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) {
+ size_t qty = *((size_t *) qty_ptr);
+ int res = 0;
+- unsigned char *a = (unsigned char *) pVect1;
+- unsigned char *b = (unsigned char *) pVect2;
++ T *a = (T *) pVect1;
++ T *b = (T *) pVect2;
+
+ qty = qty >> 2;
+ for (size_t i = 0; i < qty; i++) {
+@@ -277,11 +278,12 @@ L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const voi
+ return (res);
+ }
+
++template
+ static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) {
+ size_t qty = *((size_t*)qty_ptr);
+ int res = 0;
+- unsigned char* a = (unsigned char*)pVect1;
+- unsigned char* b = (unsigned char*)pVect2;
++ T* a = (T*)pVect1;
++ T* b = (T*)pVect2;
+
+ for (size_t i = 0; i < qty; i++) {
+ res += ((*a) - (*b)) * ((*a) - (*b));
+@@ -291,6 +293,7 @@ static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2,
+ return (res);
+ }
+
++template
+ class L2SpaceI : public SpaceInterface {
+ DISTFUNC fstdistfunc_;
+ size_t data_size_;
+@@ -299,9 +302,9 @@ class L2SpaceI : public SpaceInterface {
+ public:
+ L2SpaceI(size_t dim) {
+ if (dim % 4 == 0) {
+- fstdistfunc_ = L2SqrI4x;
++ fstdistfunc_ = L2SqrI4x;
+ } else {
+- fstdistfunc_ = L2SqrI;
++ fstdistfunc_ = L2SqrI;
}
---
-2.43.0
-
+ dim_ = dim;
+ data_size_ = dim * sizeof(unsigned char);
diff --git a/cpp/cmake/patches/hnswlib_override.json b/cpp/cmake/patches/hnswlib_override.json
index aef2da772..c50220e24 100644
--- a/cpp/cmake/patches/hnswlib_override.json
+++ b/cpp/cmake/patches/hnswlib_override.json
@@ -1,16 +1,16 @@
{
- "packages" : {
- "hnswlib" : {
- "version": "0.6.2",
- "git_url": "https://github.com/nmslib/hnswlib.git",
- "git_tag": "v${version}",
- "patches" : [
- {
- "file" : "${current_json_dir}/hnswlib.diff",
- "issue" : "Correct compilation issues",
- "fixed_in" : ""
- }
- ]
- }
+ "packages": {
+ "hnswlib": {
+ "version": "0.7.0",
+ "git_url": "https://github.com/nmslib/hnswlib.git",
+ "git_tag": "v${version}",
+ "patches": [
+ {
+ "file": "${current_json_dir}/hnswlib.diff",
+ "issue": "Correct compilation issues",
+ "fixed_in": ""
+ }
+ ]
}
- }
\ No newline at end of file
+ }
+}
\ No newline at end of file
diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake
index 61065318b..71bd2d26c 100644
--- a/cpp/cmake/thirdparty/get_cutlass.cmake
+++ b/cpp/cmake/thirdparty/get_cutlass.cmake
@@ -13,10 +13,11 @@
# =============================================================================
function(find_and_configure_cutlass)
- set(oneValueArgs VERSION REPOSITORY PINNED_TAG)
+ set(options)
+ set(oneValueArgs)
+ set(multiValueArgs)
cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
- # if(RAFT_ENABLE_DIST_DEPENDENCIES OR RAFT_COMPILE_LIBRARIES)
set(CUTLASS_ENABLE_HEADERS_ONLY
ON
CACHE BOOL "Enable only the header library"
@@ -34,13 +35,22 @@ function(find_and_configure_cutlass)
set(CUDART_LIBRARY "${CUDA_cudart_static_LIBRARY}" CACHE FILEPATH "fixing cutlass cmake code" FORCE)
endif()
+ include("${rapids-cmake-dir}/cpm/package_override.cmake")
+ rapids_cpm_package_override("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches/cutlass_override.json")
+
+ include("${rapids-cmake-dir}/cpm/detail/package_details.cmake")
+ rapids_cpm_package_details(cutlass version repository tag shallow exclude)
+
+ include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake")
+ rapids_cpm_generate_patch_command(cutlass ${version} patch_command)
+
rapids_cpm_find(
- NvidiaCutlass ${PKG_VERSION}
+ NvidiaCutlass ${version}
GLOBAL_TARGETS nvidia::cutlass::cutlass
CPM_ARGS
- GIT_REPOSITORY ${PKG_REPOSITORY}
- GIT_TAG ${PKG_PINNED_TAG}
- GIT_SHALLOW TRUE
+ GIT_REPOSITORY ${repository}
+ GIT_TAG ${tag}
+ GIT_SHALLOW ${shallow} ${patch_command}
OPTIONS "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}"
)
@@ -56,7 +66,6 @@ function(find_and_configure_cutlass)
NAMESPACE nvidia::cutlass::
)
endif()
- # endif()
# We generate the cutlass-config files when we built cutlass locally, so always do
# `find_dependency`
@@ -79,14 +88,4 @@ function(find_and_configure_cutlass)
)
endfunction()
-if(NOT RAFT_CUTLASS_GIT_TAG)
- set(RAFT_CUTLASS_GIT_TAG v2.10.0)
-endif()
-
-if(NOT RAFT_CUTLASS_GIT_REPOSITORY)
- set(RAFT_CUTLASS_GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git)
-endif()
-
-find_and_configure_cutlass(
- VERSION 2.10.0 REPOSITORY ${RAFT_CUTLASS_GIT_REPOSITORY} PINNED_TAG ${RAFT_CUTLASS_GIT_TAG}
-)
+find_and_configure_cutlass()
diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake
index 2e6c895e5..5b4d89aa2 100644
--- a/cpp/cmake/thirdparty/get_hnswlib.cmake
+++ b/cpp/cmake/thirdparty/get_hnswlib.cmake
@@ -15,6 +15,7 @@
#=============================================================================
function(find_and_configure_hnswlib)
+ message(STATUS "Finding or building hnswlib")
set(oneValueArgs)
include(${rapids-cmake-dir}/cpm/package_override.cmake)
diff --git a/cpp/include/cuvs/cluster/agglomerative.hpp b/cpp/include/cuvs/cluster/agglomerative.hpp
index e1da04085..8f7e8675a 100644
--- a/cpp/include/cuvs/cluster/agglomerative.hpp
+++ b/cpp/include/cuvs/cluster/agglomerative.hpp
@@ -18,6 +18,7 @@
#include
#include
+
#include
#include
diff --git a/cpp/include/cuvs/core/c_api.h b/cpp/include/cuvs/core/c_api.h
index c8c8d3934..400d162ad 100644
--- a/cpp/include/cuvs/core/c_api.h
+++ b/cpp/include/cuvs/core/c_api.h
@@ -151,6 +151,22 @@ cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent,
*/
cuvsError_t cuvsRMMMemoryResourceReset();
+/**
+ * @brief Allocates pinned memory on the host using RMM
+ * @param[out] ptr Pointer to allocated host memory
+ * @param[in] bytes Size in bytes to allocate
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsRMMHostAlloc(void** ptr, size_t bytes);
+
+/**
+ * @brief Deallocates pinned memory on the host using RMM
+ * @param[in] ptr Pointer to allocated host memory to free
+ * @param[in] bytes Size in bytes to deallocate
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsRMMHostFree(void* ptr, size_t bytes);
+
/** @} */
#ifdef __cplusplus
diff --git a/cpp/include/cuvs/distance/distance.hpp b/cpp/include/cuvs/distance/distance.hpp
index def72641e..42c574e58 100644
--- a/cpp/include/cuvs/distance/distance.hpp
+++ b/cpp/include/cuvs/distance/distance.hpp
@@ -20,6 +20,7 @@
#include
#include
+#include
#include
#include
@@ -331,6 +332,86 @@ void pairwise_distance(
cuvs::distance::DistanceType metric,
float metric_arg = 2.0f);
+/**
+ * @brief Compute sparse pairwise distances between x and y, using the provided
+ * input configuration and distance function.
+ *
+ * @code{.cpp}
+ * #include
+ * #include
+ * #include
+ *
+ * int x_n_rows = 100000;
+ * int y_n_rows = 50000;
+ * int n_cols = 10000;
+ *
+ * raft::device_resources handle;
+ * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols);
+ * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols);
+ *
+ * ...
+ * // populate data
+ * ...
+ *
+ * auto out = raft::make_device_matrix(handle, x_n_rows, y_n_rows);
+ * auto metric = cuvs::distance::DistanceType::L2Expanded;
+ * cuvs::distance::pairwise_distance(handle, x.view(), y.view(), out.view(), metric);
+ * @endcode
+ *
+ * @param[in] handle raft::resources
+ * @param[in] x raft::device_csr_matrix_view
+ * @param[in] y raft::device_csr_matrix_view
+ * @param[out] dist raft::device_matrix_view dense matrix
+ * @param[in] metric distance metric to use
+ * @param[in] metric_arg metric argument (used for Minkowski distance)
+ */
+void pairwise_distance(raft::resources const& handle,
+ raft::device_csr_matrix_view x,
+ raft::device_csr_matrix_view y,
+ raft::device_matrix_view dist,
+ cuvs::distance::DistanceType metric,
+ float metric_arg = 2.0f);
+
+/**
+ * @brief Compute sparse pairwise distances between x and y, using the provided
+ * input configuration and distance function.
+ *
+ * @code{.cpp}
+ * #include
+ * #include
+ * #include
+ *
+ * int x_n_rows = 100000;
+ * int y_n_rows = 50000;
+ * int n_cols = 10000;
+ *
+ * raft::device_resources handle;
+ * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols);
+ * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols);
+ *
+ * ...
+ * // populate data
+ * ...
+ *
+ * auto out = raft::make_device_matrix(handle, x_n_rows, y_n_rows);
+ * auto metric = cuvs::distance::DistanceType::L2Expanded;
+ * cuvs::distance::pairwise_distance(handle, x.view(), y.view(), out.view(), metric);
+ * @endcode
+ *
+ * @param[in] handle raft::resources
+ * @param[in] x raft::device_csr_matrix_view
+ * @param[in] y raft::device_csr_matrix_view
+ * @param[out] dist raft::device_matrix_view dense matrix
+ * @param[in] metric distance metric to use
+ * @param[in] metric_arg metric argument (used for Minkowski distance)
+ */
+void pairwise_distance(raft::resources const& handle,
+ raft::device_csr_matrix_view x,
+ raft::device_csr_matrix_view y,
+ raft::device_matrix_view dist,
+ cuvs::distance::DistanceType metric,
+ float metric_arg = 2.0f);
+
/** @} */ // end group pairwise_distance_runtime
}; // namespace cuvs::distance
diff --git a/cpp/include/cuvs/distance/grammian.hpp b/cpp/include/cuvs/distance/grammian.hpp
new file mode 100644
index 000000000..0c904d493
--- /dev/null
+++ b/cpp/include/cuvs/distance/grammian.hpp
@@ -0,0 +1,665 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include