Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Dask tests with UCX-Py/UCXX in CI #5697

Merged
merged 37 commits into from
May 18, 2024
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
a076e1e
Add UCX/UCXX cluster/client fixtures to Dask tests
pentschev Dec 12, 2023
348407e
Add options to select running UCX/UCXX Dask tests
pentschev Dec 12, 2023
d4988c2
Add UCX/UCXX Dask NN tests
pentschev Dec 12, 2023
b88b7bd
Enable UCX/UCXX tests in CI
pentschev Dec 12, 2023
b07197f
Merge remote-tracking branch 'upstream/branch-24.02' into ucxx-tests
pentschev Dec 12, 2023
f60d58a
Fix CI command line typos
pentschev Dec 12, 2023
55ad2da
Merge remote-tracking branch 'upstream/branch-24.02' into ucxx-tests
pentschev Jan 5, 2024
3a06e29
Merge remote-tracking branch 'upstream/branch-24.06' into ucxx-tests
pentschev Apr 9, 2024
c049eb3
Add RAFT Python channel to test before RAFT merges
pentschev Apr 9, 2024
2fa180f
Update copyright
pentschev Apr 9, 2024
b4e22fb
Point CMake to RAFT/cumlprims_mg UCXX PRs
pentschev Apr 9, 2024
cee0d25
Move UCX-Py/UCXX to `run_cuml_dask_pytests.sh`
pentschev Apr 9, 2024
a695651
Disable `test_fil_skl_classification`
pentschev Apr 10, 2024
13f1d4c
Add missing `distributed-ucxx` dependency
pentschev Apr 10, 2024
5b3d468
Adjust `distributed-ucxx` in `build_wheel.sh`
pentschev Apr 10, 2024
8ed59fe
Update `distributed-ucxx` version in release script
pentschev Apr 10, 2024
4e20acf
Merge remote-tracking branch 'upstream/branch-24.06' into ucxx-tests
pentschev Apr 10, 2024
8359624
Merge remote-tracking branch 'upstream/branch-24.06' into ucxx-tests
pentschev May 3, 2024
57c5f4f
Disable non-UCXX Dask wheel tests in CI
pentschev May 3, 2024
6d0819f
Pull wheels from raft-dask PR
pentschev May 3, 2024
b7c31d1
Switch to raft-dask PR also for wheel build
pentschev May 3, 2024
ea613f0
Add set -x
pentschev May 4, 2024
2d39085
Fix `raft_dask_wheelhouse`
pentschev May 6, 2024
764bba9
Undo raft testing changes
vyasr May 7, 2024
f5fd345
Fix copyright
vyasr May 7, 2024
f4d14f4
Merge remote-tracking branch 'upstream/branch-24.06' into ucxx-tests
pentschev May 7, 2024
055beb8
Revert "Add set -x"
pentschev May 7, 2024
2cb76dd
Formatting fix
pentschev May 7, 2024
d28568c
Fix comment
pentschev May 7, 2024
364dcb6
Remove direct `distributed-ucxx` dependency
pentschev May 13, 2024
6be59e8
Re-enable Dask wheel tests
pentschev May 13, 2024
89484c3
Add timeouts to Dask tests
pentschev May 14, 2024
1f29e03
Remove `distributed-ucxx` from build and release scripts
pentschev May 15, 2024
06c6c7a
Remove unnecessary UCX-Py version parsing
pentschev May 15, 2024
13862f2
Remove `--find-links dist/`
pentschev May 15, 2024
31fb4a6
Remove `pytest.ini`
pentschev May 16, 2024
ce9590a
Remove debug print statements
pentschev May 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions ci/build_wheel.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

set -euo pipefail
set -xeuo pipefail

package_name="cuml"
package_dir="python"
Expand All @@ -14,6 +14,8 @@ git_commit=$(git rev-parse HEAD)

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"

raft_dask_wheelhouse=$(RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-get-pr-wheel-artifact raft 1983 python)

vyasr marked this conversation as resolved.
Show resolved Hide resolved
# This is the version of the suffix with a preceding hyphen. It's used
# everywhere except in the final wheel name.
PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
Expand All @@ -34,7 +36,7 @@ if ! rapids-is-release-build; then
alpha_spec=',>=0.0.0a0'
fi

for dep in cudf pylibraft raft-dask rmm; do
for dep in cudf distributed-ucxx pylibraft raft-dask rmm; do
sed -r -i "s/${dep}==(.*)\"/${dep}${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
done

Expand All @@ -54,7 +56,8 @@ SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DDISABLE_DEPRECATION_WARNINGS=ON;-DC
-w dist \
-vvv \
--no-deps \
--disable-pip-version-check
--disable-pip-version-check \
--find-links ${raft_dask_wheelhouse}
vyasr marked this conversation as resolved.
Show resolved Hide resolved

mkdir -p final_dist
python -m auditwheel repair -w final_dist dist/*
Expand Down
7 changes: 6 additions & 1 deletion ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}
NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*"
NEXT_UCX_PY_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})"

# Need to distutils-normalize the original version
NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
NEXT_UCX_PY_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCX_PY_SHORT_TAG}'))")

echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"

Expand All @@ -43,6 +44,7 @@ sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.
sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml
sed_runner "s/pylibraft==.*\",/pylibraft==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml
sed_runner "s/raft-dask==.*\",/raft-dask==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml
sed_runner "s/distributed-ucxx==.*\",/distributed-ucxx==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml

DEPENDENCIES=(
cudf
Expand All @@ -64,6 +66,9 @@ for FILE in dependencies.yaml conda/environments/*.yaml; do
for DEP in "${DEPENDENCIES[@]}"; do
sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}"
done
sed_runner "/-.* distributed-ucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
sed_runner "/-.* distributed-ucxx-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
sed_runner "/-.* distributed-ucxx-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
done

sed_runner "s|/branch-[^/]*/|/branch-${NEXT_SHORT_TAG}/|g" README.md
Expand Down
12 changes: 11 additions & 1 deletion ci/run_cuml_dask_pytests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,14 @@
# Support invoking run_cuml_dask_pytests.sh outside the script directory
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/tests/dask

python -m pytest --cache-clear "$@" .
# Disable non-UCXX tests for wheels tests
if [[ -v CONDA_PREFIX ]]; then
rapids-logger "pytest cuml-dask (No UCX-Py/UCXX)"
python -m pytest --cache-clear "$@" .

rapids-logger "pytest cuml-dask (UCX-Py only)"
python -m pytest --cache-clear --run_ucx "$@" .
fi

rapids-logger "pytest cuml-dask (UCXX only)"
python -m pytest --cache-clear --run_ucxx "$@" .
3 changes: 2 additions & 1 deletion ci/test_python_common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@ set -euo pipefail
rapids-logger "Downloading artifacts from previous jobs"
CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
RAFT_CHANNEL=$(rapids-get-pr-conda-artifact raft 1983 python)
vyasr marked this conversation as resolved.
Show resolved Hide resolved

rapids-logger "Generate Python testing dependencies"
rapids-dependency-file-generator \
--output conda \
--file_key test_python \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \
--prepend-channels "${CPP_CHANNEL};${PYTHON_CHANNEL}" | tee env.yaml
--prepend-channels "${CPP_CHANNEL};${PYTHON_CHANNEL};${RAFT_CHANNEL}" | tee env.yaml
vyasr marked this conversation as resolved.
Show resolved Hide resolved

rapids-mamba-retry env create --yes -f env.yaml -n test

Expand Down
2 changes: 1 addition & 1 deletion ci/test_python_dask.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ EXITCODE=0
trap "EXITCODE=1" ERR
set +e

rapids-logger "pytest cuml-dask"
# Run tests (no UCX-Py/UCXX)
./ci/run_cuml_dask_pytests.sh \
--junitxml="${RAPIDS_TESTS_DIR}/junit-cuml-dask.xml" \
--cov-config=../../../.coveragerc \
Expand Down
3 changes: 2 additions & 1 deletion ci/test_wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@ set -euo pipefail
mkdir -p ./dist
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cuml_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
raft_dask_wheelhouse=$(RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-get-pr-wheel-artifact raft 1983 python)
vyasr marked this conversation as resolved.
Show resolved Hide resolved

# On arm also need to install CMake because treelite needs to be compiled (no wheels available for arm).
if [[ "$(arch)" == "aarch64" ]]; then
python -m pip install cmake
fi

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install $(echo ./dist/cuml*.whl)[test]
python -m pip install $(echo ./dist/cuml*.whl)[test] --find-links dist/ --find-link "${raft_dask_wheelhouse}"
vyasr marked this conversation as resolved.
Show resolved Hide resolved

RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}
mkdir -p "${RAPIDS_TESTS_DIR}"
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ dependencies:
- dask-cuda==24.6.*
- dask-cudf==24.6.*
- dask-ml
- distributed-ucxx==0.38.*
- doxygen=1.9.1
- gcc_linux-64=11.*
- graphviz
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dependencies:
- dask-cuda==24.6.*
- dask-cudf==24.6.*
- dask-ml
- distributed-ucxx==0.38.*
- doxygen=1.9.1
- gcc_linux-64=11.*
- graphviz
Expand Down
4 changes: 2 additions & 2 deletions cpp/cmake/thirdparty/get_cumlprims_mg.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#=============================================================================
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -64,7 +64,7 @@ endfunction()
###
find_and_configure_cumlprims_mg(VERSION ${CUML_MIN_VERSION_cumlprims_mg}
FORK rapidsai
PINNED_TAG branch-${CUML_BRANCH_VERSION_cumlprims_mg}
PINNED_TAG pull-request/186
vyasr marked this conversation as resolved.
Show resolved Hide resolved
BUILD_STATIC ${CUML_USE_CUMLPRIMS_MG_STATIC}
EXCLUDE_FROM_ALL ${CUML_EXCLUDE_CUMLPRIMS_MG_FROM_ALL}
# When PINNED_TAG above doesn't match cuml,
Expand Down
2 changes: 1 addition & 1 deletion cpp/cmake/thirdparty/get_raft.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ endfunction()
# CPM_raft_SOURCE=/path/to/local/raft
find_and_configure_raft(VERSION ${CUML_MIN_VERSION_raft}
FORK rapidsai
PINNED_TAG branch-${CUML_BRANCH_VERSION_raft}
PINNED_TAG pull-request/1983
vyasr marked this conversation as resolved.
Show resolved Hide resolved
EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL}
# When PINNED_TAG above doesn't match cuml,
# force local raft clone in build directory
Expand Down
25 changes: 25 additions & 0 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ files:
- common_build
- cuda
- cuda_version
- depends_on_distributed_ucxx
- docs
- py_build
- py_run
Expand Down Expand Up @@ -54,6 +55,7 @@ files:
output: none
includes:
- cuda_version
- depends_on_distributed_ucxx
- py_version
- test_cuml
- test_python
Expand Down Expand Up @@ -84,6 +86,7 @@ files:
table: project.optional-dependencies
key: test
includes:
- depends_on_distributed_ucxx
- test_python
channels:
- rapidsai
Expand Down Expand Up @@ -441,3 +444,25 @@ dependencies:
- pandas
- *scikit_learn
- seaborn
depends_on_distributed_ucxx:
common:
- output_types: conda
packages:
# UCXX is not currently a hard dependency and is thus only installed during tests;
# this will change in the future.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we not rely on raft-dask to pull in the ucxx dependency? cuml doesn't use it directly, only via raft-dask, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My intent was to make ucxx an optional dependency of raft-dask although I now realize that's not the case in https://github.com/rapidsai/raft/blob/branch-24.06/conda/recipes/raft-dask/meta.yaml . I think it's ok to have ucxx (and thus libucxx) pulled in by RAFT -- even though it's technically not a hard dependency because it won't be activated by default -- but it will bloat the installation a bit. What I'm saying is we have two options:

  1. Make ucxx a soft dependency of raft-dask and then leave this here; or
  2. Accept ucxx as a hard dependency of raft-dask and remove this.

Do you have thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's just stick with a hard dependency for now. It's easier to manage and that is the long-term plan anyway.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was just thinking a bit more about this and perhaps it should indeed be an optional dependency everywhere, both in raft-dask and cuml. Since UCX is not a requirement to run either one (people may choose NCCL as a replacement, for example), users do not always require UCX-Py or UCXX.

If it makes things simpler for us for the moment, I think it's ok to make it a hard dependency, but in the long run I don't think it makes sense from the packaging perspective to require UCX-Py/UCXX. WDYT @vyasr ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ultimately raft-dask is going to have to be compiled against libucxx, right? The C++ dependency will be there no matter what, unless you plan to rewrite the raft C++ to use a dlopen of ucxx.

- &distributed_ucxx_conda distributed-ucxx==0.38.*
- output_types: requirements
packages:
# pip recognizes the index as a global option for the requirements.txt file
- --extra-index-url=https://pypi.nvidia.com
- --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
specific:
- output_types: [requirements, pyproject]
matrices:
- matrix: {cuda: "12.*"}
packages:
- distributed-ucxx-cu12==0.38.*
- matrix: {cuda: "11.*"}
packages:
- distributed-ucxx-cu11==0.38.*
- {matrix: null, packages: [*distributed_ucxx_conda]}
71 changes: 60 additions & 11 deletions python/cuml/tests/dask/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import pytest

Expand Down Expand Up @@ -34,18 +34,8 @@ def client(cluster):

@pytest.fixture(scope="module")
def ucx_cluster():
initialize.initialize(
create_cuda_context=True,
enable_tcp_over_ucx=enable_tcp_over_ucx,
enable_nvlink=enable_nvlink,
enable_infiniband=enable_infiniband,
)
cluster = LocalCUDACluster(
protocol="ucx",
enable_tcp_over_ucx=enable_tcp_over_ucx,
enable_nvlink=enable_nvlink,
enable_infiniband=enable_infiniband,
worker_class=IncreasedCloseTimeoutNanny,
)
yield cluster
cluster.close()
Expand All @@ -57,3 +47,62 @@ def ucx_client(ucx_cluster):
client = Client(ucx_cluster)
yield client
client.close()


@pytest.fixture(scope="module")
def ucxx_cluster():
    """Yield a module-scoped ``LocalCUDACluster`` using the ``ucxx`` protocol.

    The cluster is created with ``IncreasedCloseTimeoutNanny`` as its worker
    class and is closed once the requesting module is done with it.

    Tests are skipped when ``distributed_ucxx`` cannot be imported.
    """
    # ``ucxx_client`` depends on this fixture, so pytest sets this fixture up
    # before the client's own ``importorskip`` runs. Guard here so a missing
    # package results in a skip before any cluster is created.
    # NOTE(review): presumably the "ucxx" protocol is provided by
    # ``distributed_ucxx`` -- confirm.
    pytest.importorskip("distributed_ucxx")

    cluster = LocalCUDACluster(
        protocol="ucxx",
        worker_class=IncreasedCloseTimeoutNanny,
    )
    yield cluster
    cluster.close()


@pytest.fixture(scope="function")
def ucxx_client(ucxx_cluster):
    """Yield a per-test Dask ``Client`` connected to ``ucxx_cluster``.

    The test is skipped when ``distributed_ucxx`` cannot be imported. The
    client is closed after the test finishes.
    """
    pytest.importorskip("distributed_ucxx")

    dask_client = Client(ucxx_cluster)
    yield dask_client
    dask_client.close()


def pytest_addoption(parser):
    """Register the ``--run_ucx`` and ``--run_ucxx`` command-line flags.

    Both flags are boolean (``store_true``) and are added to the
    "Dask cuML Custom Options" option group obtained from ``parser``.
    """
    custom_options = parser.getgroup("Dask cuML Custom Options")

    # (flag, help text) pairs, registered in this order.
    flags = (
        ("--run_ucx", "run _only_ UCX-Py tests"),
        ("--run_ucxx", "run _only_ UCXX tests"),
    )
    for flag, description in flags:
        custom_options.addoption(flag, action="store_true", help=description)


def pytest_collection_modifyitems(config, items):
    """Add skip markers to collected items based on the UCX/UCXX flags.

    The keywords ``ucx`` and ``ucxx`` are handled one after the other, each
    against its own option (``--run_ucx`` / ``--run_ucxx``):

    * option set: every item *without* the keyword gets a skip marker;
    * option not set: every item *with* the keyword gets a skip marker.
    """
    for keyword in ("ucx", "ucxx"):
        option = f"--run_{keyword}"
        only_this_backend = config.getoption(option)

        if only_this_backend:
            marker = pytest.mark.skip(
                reason=f"only runs when {option} is not specified"
            )
        else:
            marker = pytest.mark.skip(reason=f"requires {option} to run")

        for item in items:
            has_keyword = keyword in item.keywords
            # Flag set: skip everything lacking the keyword.
            # Flag unset: skip everything carrying the keyword.
            if has_keyword != bool(only_this_backend):
                item.add_marker(marker)
Loading
Loading