diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile
index e191e8cf5..94e6e6cb3 100644
--- a/cloudbuild/Dockerfile
+++ b/cloudbuild/Dockerfile
@@ -1,4 +1,4 @@
-# This Dockerfile spins up a container where presubmit tests are run.
+# This Dockerfile builds the container from which presubmit tests are run.
 # Cloud Build orchestrates this process.

 FROM gcr.io/cloud-builders/gcloud
@@ -9,8 +9,16 @@ COPY --chown=ia-tests:ia-tests . /init-actions

 # Install Bazel:
 # https://docs.bazel.build/versions/master/install-ubuntu.html
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list
-RUN curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
-RUN apt-get update && apt-get install -y openjdk-8-jdk python3-setuptools bazel
+ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg
+RUN apt-get install -y -qq curl >/dev/null 2>&1 && \
+    apt-get clean
+RUN /usr/bin/curl https://bazel.build/bazel-release.pub.gpg | \
+    gpg --dearmor -o "${bazel_kr_path}"
+RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis.com/bazel-apt stable jdk1.8" | \
+    dd of=/etc/apt/sources.list.d/bazel.list status=none && \
+    apt-get update -qq
+RUN apt-get autoremove -y -qq && \
+    apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel >/dev/null 2>&1 && \
+    apt-get clean

 USER ia-tests
diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index b09789a52..be06ea5a6 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -70,6 +70,7 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
    if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
+      continue # remove this before squash/merge
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0
diff --git a/cloudbuild/run-presubmit-on-k8s.sh b/cloudbuild/run-presubmit-on-k8s.sh
index 810213832..c573fd9a7 100644
--- a/cloudbuild/run-presubmit-on-k8s.sh
+++ b/cloudbuild/run-presubmit-on-k8s.sh
@@ -12,6 +12,8 @@ gcloud container clusters get-credentials "${CLOUDSDK_CONTAINER_CLUSTER}"

 LOGS_SINCE_TIME=$(date --iso-8601=seconds)

+# This kubectl run sometimes fails because cluster services have not yet caught up; give them a moment.
+sleep 10s
 kubectl run "${POD_NAME}" \
   --image="${IMAGE}" \
   --restart=Never \
diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index c2e6577b1..936718498 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -17,7 +17,7 @@
 FLAGS = flags.FLAGS
 flags.DEFINE_string('image', None, 'Dataproc image URL')
-flags.DEFINE_string('image_version', None, 'Dataproc image version, e.g. 1.4')
+flags.DEFINE_string('image_version', None, 'Dataproc image version, e.g. 2.2')
 flags.DEFINE_boolean('skip_cleanup', False, 'Skip cleanup of test resources')
 FLAGS(sys.argv)

@@ -122,9 +122,9 @@ def createCluster(self,
             args.append("--public-ip-address")

         for i in init_actions:
-            if "install_gpu_driver.sh" in i or \
-                "mlvm.sh" in i or "rapids.sh" in i or \
-                "spark-rapids.sh" in i or "horovod.sh" in i:
+            if "install_gpu_driver.sh" in i or "horovod.sh" in i or \
+                "dask-rapids.sh" in i or "mlvm.sh" in i or \
+                "spark-rapids.sh" in i:
                 args.append("--no-shielded-secure-boot")

         if optional_components:
@@ -178,11 +178,15 @@ def createCluster(self,
         args.append("--zone={}".format(self.cluster_zone))

         if not FLAGS.skip_cleanup:
-            args.append("--max-age=2h")
+            args.append("--max-age=60m")
+
+        args.append("--max-idle=25m")

         cmd = "{} dataproc clusters create {} {}".format(
             "gcloud beta" if beta else "gcloud", self.name, " ".join(args))
+
+        print("Running command: [{}]".format(cmd))
+
         _, stdout, _ = self.assert_command(
             cmd, timeout_in_minutes=timeout_in_minutes or DEFAULT_TIMEOUT)
         config = json.loads(stdout).get("config", {})
@@ -239,7 +243,7 @@ def getClusterName(self):

     @staticmethod
     def getImageVersion():
-        # Get a numeric version from the version flag: '1.5-debian10' -> '1.5'.
+        # Get a numeric version from the version flag: '2.2-debian10' -> '2.2'.
         # Special case a 'preview' image versions and return a large number
         # instead to make it a higher image version in comparisons
         version = FLAGS.image_version
@@ -248,7 +252,7 @@ def getImageOs():

     @staticmethod
     def getImageOs():
-        # Get OS string from the version flag: '1.5-debian10' -> 'debian'.
+        # Get OS string from the version flag: '2.2-debian10' -> 'debian'.
         # If image version specified without OS suffix ('2.0')
         # then return 'debian' by default
         version = FLAGS.image_version
diff --git a/rapids/BUILD b/rapids/BUILD
index c4db3e191..c5e2d3569 100644
--- a/rapids/BUILD
+++ b/rapids/BUILD
@@ -8,8 +8,6 @@ py_test(
     srcs = ["test_rapids.py"],
     data = [
         "rapids.sh",
-        "verify_xgboost_spark.scala",
-        "//dask:dask.sh",
         "//gpu:install_gpu_driver.sh",
     ],
     local = True,
diff --git a/rapids/Dockerfile b/rapids/Dockerfile
new file mode 100644
index 000000000..ad2c89086
--- /dev/null
+++ b/rapids/Dockerfile
@@ -0,0 +1,40 @@
+# This Dockerfile builds the container from which RAPIDS tests are run.
+# This process needs to be executed manually from a git clone.
+#
+# See manual-test-runner.sh for instructions
+
+FROM gcr.io/cloud-builders/gcloud
+
+RUN useradd -m -d /home/ia-tests -s /bin/bash ia-tests
+
+RUN apt-get -qq update \
+    && apt-get -y -qq install \
+       apt-transport-https apt-utils \
+       ca-certificates libmime-base64-perl gnupg \
+       curl jq less screen > /dev/null 2>&1 && apt-get clean
+
+# Install bazel signing key, repo and package
+ENV bazel_kr_path=/usr/share/keyrings/bazel-release.pub.gpg
+ENV bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8"
+
+RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \
+    | gpg --dearmor -o "${bazel_kr_path}" \
+    && echo "deb [arch=amd64 signed-by=${bazel_kr_path}] ${bazel_repo_data}" \
+    | dd of=/etc/apt/sources.list.d/bazel.list status=none \
+    && apt-get update -qq
+
+RUN apt-get autoremove -y -qq && \
+    apt-get install -y -qq default-jdk python3-setuptools bazel > /dev/null 2>&1 && \
+    apt-get clean
+
+
+# Install here any utilities you find useful when troubleshooting
+RUN apt-get -y -qq install emacs-nox vim uuid-runtime > /dev/null 2>&1 && apt-get clean
+
+WORKDIR /init-actions
+
+USER ia-tests
+COPY --chown=ia-tests:ia-tests . ${WORKDIR}
+
+ENTRYPOINT ["/bin/bash"]
+#CMD ["/bin/bash"]
diff --git a/rapids/bazel.screenrc b/rapids/bazel.screenrc
new file mode 100644
index 000000000..05d4cb788
--- /dev/null
+++ b/rapids/bazel.screenrc
@@ -0,0 +1,17 @@
+#
+# For debugging, uncomment the following line
+#
+
+# screen -L -t monitor 0 /bin/bash
+
+screen -L -t 2.0-debian10 1 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.0-debian10 ; exec /bin/bash'
+#screen -L -t 2.0-rocky8 2 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.0-rocky8 ; exec /bin/bash'
+#screen -L -t 2.0-ubuntu18 3 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.0-ubuntu18 ; exec /bin/bash'
+
+#screen -L -t 2.1-debian11 4 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-debian11 ; exec /bin/bash'
+#screen -L -t 2.1-rocky8 5 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-rocky8 ; exec /bin/bash'
+#screen -L -t 2.1-ubuntu20 6 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-ubuntu20 ; exec /bin/bash'
+
+#screen -L -t 2.2-debian12 7 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-debian12 ; exec /bin/bash'
+#screen -L -t 2.2-rocky9 8 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-rocky9 ; exec /bin/bash'
+#screen -L -t 2.2-ubuntu22 9 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-ubuntu22 ; exec /bin/bash'
diff --git a/rapids/env.json.sample b/rapids/env.json.sample
new file mode 100644
index 000000000..00d9fd65c
--- /dev/null
+++ b/rapids/env.json.sample
@@ -0,0 +1,7 @@
+{
+  "PROJECT_ID":"example-yyyy-nn",
+  "PURPOSE":"cuda-pre-init",
+  "BUCKET":"my-bucket-name",
+  "IMAGE_VERSION":"2.2-debian12",
+  "ZONE":"us-west4-ñ"
+}
diff --git a/rapids/manual-test-runner.sh b/rapids/manual-test-runner.sh
new file mode 100644
index 000000000..371917a40
--- /dev/null
+++ b/rapids/manual-test-runner.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+# This script sets up the gcloud environment and launches tests in a screen session.
+#
+# To run the script, the following commands will bootstrap the environment:
+#
+# git clone git@github.com:GoogleCloudDataproc/initialization-actions
+# cd initialization-actions
+# git checkout rapids-20240806
+# cp rapids/env.json.sample env.json
+# vi env.json
+# docker build -f rapids/Dockerfile -t rapids-init-actions-runner:latest .
+# time docker run -it rapids-init-actions-runner:latest rapids/manual-test-runner.sh
+#
+# The bazel run(s) happen in separate screen windows.
+# To see a list of screen windows, press ^a "
+# Num Name
+#
+#   0 monitor
+#   1 2.0-debian10
+#   2 sh
+
+
+readonly timestamp="$(date +%F-%H-%M)"
+export BUILD_ID="$(uuidgen)"
+
+tmp_dir="/tmp/${BUILD_ID}"
+log_dir="${tmp_dir}/logs"
+mkdir -p "${log_dir}"
+
+IMAGE_VERSION="$1"
+if [[ -z "${IMAGE_VERSION}" ]] ; then
+  IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" ; fi ; export IMAGE_VERSION
+export PROJECT_ID="$(jq -r .PROJECT_ID env.json)"
+export REGION="$(jq -r .REGION env.json)"
+export BUCKET="$(jq -r .BUCKET env.json)"
+
+gcs_log_dir="gs://${BUCKET}/${BUILD_ID}/logs"
+
+function exit_handler() {
+  RED='\\e[0;31m'
+  GREEN='\\e[0;32m'
+  NC='\\e[0m'
+  echo 'Cleaning up before exiting.'
+
+  # TODO: list clusters which match our BUILD_ID and clean them up
+  # TODO: remove any test related resources in the project
+
+  echo 'Uploading local logs to GCS bucket.'
+ gsutil -m rsync -r "${log_dir}/" "${gcs_log_dir}/" + + if [[ -f "${tmp_dir}/tests_success" ]]; then + echo -e "${GREEN}Workflow succeeded, check logs at ${log_dir}/ or ${gcs_log_dir}/${NC}" + exit 0 + else + echo -e "${RED}Workflow failed, check logs at ${log_dir}/ or ${gcs_log_dir}/${NC}" + exit 1 + fi +} + +trap exit_handler EXIT + +# screen session name +session_name="manual-rapids-tests" + +gcloud config set project ${PROJECT_ID} +gcloud config set dataproc/region ${REGION} +gcloud auth login +gcloud config set compute/region ${REGION} + +export INTERNAL_IP_SSH="true" + +# Run tests in screen session so we can monitor the container in another window +screen -US "${session_name}" -c rapids/bazel.screenrc + + + diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 9cc708855..6c5c9d411 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -1,272 +1,531 @@ #!/bin/bash -set -euxo pipefail +# Copyright 2019,2020,2021,2022,2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -# Detect dataproc image version from its various names -if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" -fi +# This initialization action script will install rapids on a Dataproc +# cluster. -function os_id() { - grep '^ID=' /etc/os-release | cut -d= -f2 | xargs -} +set -euxo pipefail -function os_version() { - grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs +function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } +function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } +function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } +function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } +function is_debuntu() { is_debian || is_ubuntu ; } + +function print_metadata_value() { + local readonly tmpfile=$(mktemp) + http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ + -s -o ${tmpfile} 2>/dev/null) + local readonly return_code=$? + # If the command completed successfully, print the metadata value to stdout. + if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then + cat ${tmpfile} + fi + rm -f ${tmpfile} + return ${return_code} } -function os_codename() { - grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs +function print_metadata_value_if_exists() { + local return_code=1 + local readonly url=$1 + print_metadata_value ${url} + return_code=$? + return ${return_code} } -function is_rocky() { - [[ "$(os_id)" == 'rocky' ]] +function get_metadata_value() { + set +x + local readonly varname=$1 + local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 + # Print the instance metadata value. + print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} + return_code=$? + # If the instance doesn't have the value, try the project. + if [[ ${return_code} != 0 ]]; then + print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} + return_code=$? 
+ fi + set -x + return ${return_code} } -function is_ubuntu() { - [[ "$(os_id)" == 'ubuntu' ]] -} +function get_metadata_attribute() ( + set +x + local -r attribute_name="$1" + local -r default_value="${2:-}" + get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" +) -function is_ubuntu20() { - is_ubuntu && [[ "$(os_version)" == '20.04'* ]] -} +function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } +function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } -function is_ubuntu22() { - is_ubuntu && [[ "$(os_version)" == '22.04'* ]] +function execute_with_retries() { + local -r cmd="$*" + for i in {0..9} ; do + if eval "$cmd"; then + return 0 ; fi + sleep 5 + done + echo "Cmd '${cmd}' failed." + return 1 } -function is_debian() { - [[ "$(os_id)" == 'debian' ]] -} +function configure_dask_yarn() { + readonly DASK_YARN_CONFIG_DIR=/etc/dask/ + readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml + # Minimal custom configuration is required for this + # setup. Please see https://yarn.dask.org/en/latest/quickstart.html#usage + # for information on tuning Dask-Yarn environments. + mkdir -p "${DASK_YARN_CONFIG_DIR}" -function is_debian11() { - is_debian && [[ "$(os_version)" == '11'* ]] -} + cat <"${DASK_YARN_CONFIG_FILE}" +# Config file for Dask Yarn. +# +# These values are joined on top of the default config, found at +# https://yarn.dask.org/en/latest/configuration.html#default-configuration -function is_debian12() { - is_debian && [[ "$(os_version)" == '12'* ]] +yarn: + environment: python://${DASK_CONDA_ENV}/bin/python + + worker: + count: 2 + gpus: 1 + class: "dask_cuda.CUDAWorker" +EOF } -function os_vercat() { - if is_ubuntu ; then - os_version | sed -e 's/[^0-9]//g' - elif is_rocky ; then - os_version | sed -e 's/[^0-9].*$//g' +function install_systemd_dask_worker() { + echo "Installing systemd Dask Worker service..." 
+ local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}" + + mkdir -p "${dask_worker_local_dir}" + + local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh" + + cat <"${DASK_WORKER_LAUNCHER}" +#!/bin/bash +LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log" +nvidia-smi -c DEFAULT +echo "dask-cuda-worker starting, logging to \${LOGFILE}" +${DASK_CONDA_ENV}/bin/dask-cuda-worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 +EOF + + chmod 750 "${DASK_WORKER_LAUNCHER}" + + local -r dask_service_file="/usr/lib/systemd/system/${DASK_WORKER_SERVICE}.service" + cat <"${dask_service_file}" +[Unit] +Description=Dask Worker Service +[Service] +Type=simple +Restart=on-failure +ExecStart=/bin/bash -c 'exec ${DASK_WORKER_LAUNCHER}' +[Install] +WantedBy=multi-user.target +EOF + chmod a+r "${dask_service_file}" + + systemctl daemon-reload + + # Enable the service + if [[ "${ROLE}" != "Master" ]]; then + enable_worker_service="1" else - os_version + local RUN_WORKER_ON_MASTER=$(get_metadata_attribute dask-cuda-worker-on-master 'true') + # Enable service on single-node cluster (no workers) + local worker_count="$(get_metadata_attribute dataproc-worker-count)" + if [[ "${worker_count}" == "0" || "${RUN_WORKER_ON_MASTER}" == "true" ]]; then + enable_worker_service="1" + fi + fi + + if [[ "${enable_worker_service}" == "1" ]]; then + systemctl enable "${DASK_WORKER_SERVICE}" + systemctl restart "${DASK_WORKER_SERVICE}" fi } -function get_metadata_attribute() { - local -r attribute_name=$1 - local -r default_value=$2 - /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" +function install_systemd_dask_scheduler() { + # only run scheduler on primary master + if [[ "$(hostname -s)" != "${MASTER}" ]]; then return ; fi + echo "Installing systemd Dask Scheduler service..." 
+ local -r dask_scheduler_local_dir="/tmp/${DASK_SCHEDULER_SERVICE}" + + mkdir -p "${dask_scheduler_local_dir}" + + local DASK_SCHEDULER_LAUNCHER="/usr/local/bin/${DASK_SCHEDULER_SERVICE}-launcher.sh" + + cat <"${DASK_SCHEDULER_LAUNCHER}" +#!/bin/bash +LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log" +echo "dask scheduler starting, logging to \${LOGFILE}" +${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1 +EOF + + chmod 750 "${DASK_SCHEDULER_LAUNCHER}" + + local -r dask_service_file="/usr/lib/systemd/system/${DASK_SCHEDULER_SERVICE}.service" + cat <"${dask_service_file}" +[Unit] +Description=Dask Scheduler Service +[Service] +Type=simple +Restart=on-failure +ExecStart=/bin/bash -c 'exec ${DASK_SCHEDULER_LAUNCHER}' +[Install] +WantedBy=multi-user.target +EOF + chmod a+r "${dask_service_file}" + + systemctl daemon-reload + + # Enable the service + systemctl enable "${DASK_SCHEDULER_SERVICE}" } -readonly DEFAULT_DASK_RAPIDS_VERSION="23.12" -readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) - -readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="22.10.0" - -if [[ "${SPARK_VERSION_ENV%%.*}" == "3" ]]; then - readonly DEFAULT_CUDA_VERSION="11.8" - readonly DEFAULT_XGBOOST_VERSION="2.0.3" - readonly SPARK_VERSION="${SPARK_VERSION_ENV}" -else - readonly DEFAULT_CUDA_VERSION="10.1" - readonly DEFAULT_XGBOOST_VERSION="1.0.0" - readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="Beta5" - readonly SPARK_VERSION="2.x" -fi - -readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) -readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) - -readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') -readonly RUN_WORKER_ON_MASTER=$(get_metadata_attribute 'dask-cuda-worker-on-master' 'true') - -# RAPIDS config -CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) -if [[ "${CUDA_VERSION%%.*}" == 12 ]]; then - # at the time of writing 20240721 there is no support for the 12.x - # releases of cudatoolkit package in mamba. For the time being, - # we will use a maximum of 11.8 - CUDA_VERSION="11.8" -fi -readonly CUDA_VERSION - -# SPARK config -readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) -readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) -readonly XGBOOST_GPU_SUB_VERSION=$(get_metadata_attribute 'spark-gpu-sub-version' ${DEFAULT_XGBOOST_GPU_SUB_VERSION}) - -# Scala config -readonly SCALA_VER="2.12" - -# Dask config -readonly DASK_LAUNCHER=/usr/local/bin/dask-launcher.sh -readonly DASK_SERVICE=dask-cluster -readonly DASK_YARN_CONFIG_FILE=/etc/dask/config.yaml - -# Dataproc configurations -readonly SPARK_CONF_DIR='/etc/spark/conf' +function install_systemd_dask_service() { + install_systemd_dask_scheduler + install_systemd_dask_worker +} -function execute_with_retries() { - local -r cmd=$1 - for ((i = 0; i < 10; i++)); do - if eval "$cmd"; then - return 0 - fi - sleep 5 - done - return 1 +function restart_knox() { + systemctl stop knox + rm -rf "${KNOX_HOME}/data/deployments/*" + systemctl start knox } -function install_dask_rapids() { - if is_debian11 || is_debian12 || is_ubuntu20 || is_ubuntu22 ; then - local python_ver="3.10" - else - local python_ver="3.9" +function configure_knox_for_dask() { + if [[ ! 
-d "${KNOX_HOME}" ]]; then + echo "Skip configuring Knox rules for Dask" + return 0 fi - # Install RAPIDS, cudatoolkit - mamba install -m -n 'dask-rapids' -y --no-channel-priority -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ - "cudatoolkit=${CUDA_VERSION}" "pandas<1.5" "rapids=${RAPIDS_VERSION}" "python=${python_ver}" -} -function install_spark_rapids() { - local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' - local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - - if [[ "${SPARK_VERSION}" == "3"* ]]; then - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_${SCALA_VER}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${SCALA_VER}-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_${SCALA_VER}/${XGBOOST_VERSION}/xgboost4j-gpu_${SCALA_VER}-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${nvidia_repo_url}/rapids-4-spark_${SCALA_VER}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${SCALA_VER}-${SPARK_RAPIDS_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - else - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${rapids_repo_url}/xgboost4j-spark_${SPARK_VERSION}/${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}/xgboost4j-spark_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${rapids_repo_url}/xgboost4j_${SPARK_VERSION}/${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}/xgboost4j_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar" \ - -P /usr/lib/spark/jars/ + local DASK_UI_PORT=8787 + if [[ -f /etc/knox/conf/topologies/default.xml ]]; then + sed -i \ + "/<\/topology>/i DASK<\/role>http://localhost:${DASK_UI_PORT}<\/url><\/service> DASKWS<\/role>ws:\/\/${MASTER}:${DASK_UI_PORT}<\/url><\/service>" \ + /etc/knox/conf/topologies/default.xml fi -} -function configure_spark() { - if [[ "${SPARK_VERSION}" == "3"* ]]; then - cat >>${SPARK_CONF_DIR}/spark-defaults.conf <"${KNOX_DASK_DIR}/service.xml" <<'EOF' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + EOF - else - cat >>${SPARK_CONF_DIR}/spark-defaults.conf <"${KNOX_DASK_DIR}/rewrite.xml" <<'EOF' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + EOF - fi -} -configure_systemd_dask_service() { - echo "Configuring systemd Dask service for RAPIDS..." - local -r dask_worker_local_dir="/tmp/dask" - local conda_env_bin - conda_env_bin=$(conda info --base)/bin + mkdir -p "${KNOX_DASKWS_DIR}" - # Replace Dask Launcher file with dask-cuda config - systemctl stop ${DASK_SERVICE} + cat >"${KNOX_DASKWS_DIR}/service.xml" <<'EOF' + - if [[ "${ROLE}" == "Master" ]]; then - cat <"${DASK_LAUNCHER}" -#!/bin/bash -if [[ "${RUN_WORKER_ON_MASTER}" == true ]]; then - nvidia-smi -c DEFAULT - echo "dask-cuda-worker starting, logging to /var/log/dask-cuda-worker.log." - ${conda_env_bin}/dask-cuda-worker ${MASTER}:8786 --local-directory=${dask_worker_local_dir} --memory-limit=auto > /var/log/dask-cuda-worker.log 2>&1 & -fi -echo "dask-scheduler starting, logging to /var/log/dask-scheduler.log." 
-${conda_env_bin}/dask-scheduler > /var/log/dask-scheduler.log 2>&1 + + + + + + + + + + + + + + + + + EOF - else - nvidia-smi -c DEFAULT - cat <"${DASK_LAUNCHER}" -#!/bin/bash -${conda_env_bin}/dask-cuda-worker ${MASTER}:8786 --local-directory=${dask_worker_local_dir} --memory-limit=auto > /var/log/dask-cuda-worker.log 2>&1 + + cat >"${KNOX_DASKWS_DIR}/rewrite.xml" <<'EOF' + + + + + + + EOF - fi - chmod 750 "${DASK_LAUNCHER}" - systemctl daemon-reload - echo "Restarting Dask cluster..." - systemctl start "${DASK_SERVICE}" + chown -R knox:knox "${KNOX_DASK_DIR}" "${KNOX_DASKWS_DIR}" + + # Do not restart knox during pre-init script run + if [[ -n "${ROLE}" ]]; then + restart_knox + fi } -function configure_dask_yarn() { - local base - base=$(conda info --base) +function configure_fluentd_for_dask() { + if [[ "$(hostname -s)" == "${MASTER}" ]]; then + cat >/etc/google-fluentd/config.d/dataproc-dask.conf < + @type tail + path /var/log/dask-scheduler.log + pos_file /var/tmp/fluentd.dataproc.dask.scheduler.pos + read_from_head true + tag google.dataproc.dask-scheduler + + @type none + + + + + @type record_transformer + + filename dask-scheduler.log + + +EOF + fi - # Replace config file on cluster. - cat <"${DASK_YARN_CONFIG_FILE}" -# Config file for Dask Yarn. -# -# These values are joined on top of the default config, found at -# https://yarn.dask.org/en/latest/configuration.html#default-configuration + if [[ "${enable_worker_service}" == "1" ]]; then + cat >>/etc/google-fluentd/config.d/dataproc-dask.conf < + @type tail + path /var/log/dask-worker.log + pos_file /var/tmp/fluentd.dataproc.dask.worker.pos + read_from_head true + tag google.dataproc.dask-worker + + @type none + + + + + @type record_transformer + + filename dask-worker.log + + +EOF + fi -yarn: - environment: python://${base}/bin/python + systemctl restart google-fluentd +} - worker: - count: 2 - gpus: 1 - class: "dask_cuda.CUDAWorker" -EOF +function install_dask_rapids() { + if is_cuda12 ; then + local python_spec="python>=3.11" + local cuda_spec="cuda-version>=12,<13" + local dask_spec="dask>=2024.7" + local numba_spec="numba" + elif is_cuda11 ; then + local python_spec="python>=3.9" + local cuda_spec="cuda-version>=11,<12.0a0" + local dask_spec="dask" + local numba_spec="numba" + fi + + rapids_spec="rapids>=${RAPIDS_VERSION}" + CONDA_PACKAGES=() + if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then + # Pin `distributed` and `dask` package versions to old release + # because `dask-yarn` 0.9 uses skein in a way which + # is not compatible with `distributed` package 2022.2 and newer: + # https://github.com/dask/dask-yarn/issues/155 + + dask_spec="dask<2022.2" + python_spec="python>=3.7,<3.8.0a0" + rapids_spec="rapids<=24.05" + if is_ubuntu18 ; then + # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic + CONDA_PACKAGES+=("fiona<1.8.22") + fi + CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2") + fi + + CONDA_PACKAGES+=( + "${cuda_spec}" + "${rapids_spec}" + "${dask_spec}" + "dask-bigquery" + "dask-ml" + "dask-sql" + "cudf" + "${numba_spec}" + ) + + # Install cuda, rapids, dask + mamba="/opt/conda/miniconda3/bin/mamba" + conda="/opt/conda/miniconda3/bin/conda" + + "${conda}" remove -n dask --all || echo "unable to remove conda environment [dask]" + + ( set +e + local is_installed="0" + for installer in "${mamba}" "${conda}" ; do + test -d "${DASK_CONDA_ENV}" || \ + time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ + -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ + 
${CONDA_PACKAGES[*]} \ + "${python_spec}" \ + > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + sync + if [[ "$retval" == "0" ]] ; then + is_installed="1" + break + fi + "${conda}" config --set channel_priority flexible + done + if [[ "${is_installed}" == "0" ]]; then + echo "failed to install dask" + return 1 + fi + ) } function main() { - if [[ "${RUNTIME}" == "DASK" ]]; then - # Install RAPIDS - install_dask_rapids - - # In "standalone" mode, Dask relies on a shell script to launch. - # In "yarn" mode, it relies a config.yaml file. - if [[ -f "${DASK_LAUNCHER}" ]]; then - configure_systemd_dask_service - elif [[ -f "${DASK_YARN_CONFIG_FILE}" ]]; then - configure_dask_yarn - fi - echo "RAPIDS installed with Dask runtime" - elif [[ "${RUNTIME}" == "SPARK" ]]; then - install_spark_rapids - configure_spark - echo "RAPIDS initialized with Spark runtime" + # Install Dask with RAPIDS + install_dask_rapids + + # In "standalone" mode, Dask relies on a systemd unit to launch. + # In "yarn" mode, it relies a config.yaml file. + if [[ "${DASK_RUNTIME}" == "yarn" ]]; then + # Create Dask YARN config file + configure_dask_yarn else - echo "Unsupported RAPIDS Runtime: ${RUNTIME}" - exit 1 + # Create Dask service + install_systemd_dask_service + + if [[ "$(hostname -s)" == "${MASTER}" ]]; then + systemctl start "${DASK_SCHEDULER_SERVICE}" + systemctl status "${DASK_SCHEDULER_SERVICE}" + fi + + echo "Starting Dask 'standalone' cluster..." + if [[ "${enable_worker_service}" == "1" ]]; then + systemctl start "${DASK_WORKER_SERVICE}" + systemctl status "${DASK_WORKER_SERVICE}" + fi + + configure_knox_for_dask + + local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" + if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then + configure_fluentd_for_dask + fi fi + echo "Dask RAPIDS for ${DASK_RUNTIME} successfully initialized." if [[ "${ROLE}" == "Master" ]]; then systemctl restart hadoop-yarn-resourcemanager.service # Restart NodeManager on Master as well if this is a single-node-cluster. 
- if systemctl status hadoop-yarn-nodemanager; then + if systemctl list-units | grep hadoop-yarn-nodemanager; then systemctl restart hadoop-yarn-nodemanager.service fi else @@ -274,4 +533,131 @@ function main() { fi } +function exit_handler() ( + set +e + echo "Exit handler invoked" + + # Free conda cache + /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1 + + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # remove the tmpfs conda pkgs_dirs + if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do + if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then + rm -rf ${shmdir}/* + umount -f ${shmdir} + fi + done + + # Clean up OS package cache ; re-hold systemd package + if is_debuntu ; then + apt-get -y -qq clean + apt-get -y -qq autoremove + else + dnf clean all + fi + + # print disk usage statistics + if is_debuntu ; then + # Rocky doesn't have sort -h and fails when the argument is passed + du --max-depth 3 -hx / | sort -h | tail -10 + fi + + # Process disk usage logs from installation period + rm -f "${tmpdir}/keep-running-df" + sleep 6s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): +#Filesystem Size Used Avail Use% Mounted on +#/dev/vda2 6.8G 2.5G 4.0G 39% / + df -h / | tee -a "${tmpdir}/disk-usage.log" + perl -e '$max=( sort + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } )[-1]; +print( "maximum-disk-used: $max", $/ );' < "${tmpdir}/disk-usage.log" + + echo "exit_handler has completed" + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + dd if=/dev/zero of=/zero ; sync ; rm -f /zero + fi + + return 0 +) + +function prepare_to_install(){ + readonly DEFAULT_CUDA_VERSION="12.4" + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) + readonly CUDA_VERSION + + readonly ROLE=$(get_metadata_attribute dataproc-role) + readonly MASTER=$(get_metadata_attribute dataproc-master) + + # RAPIDS config + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') + readonly RAPIDS_RUNTIME + + readonly DEFAULT_DASK_RAPIDS_VERSION="24.08" + readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) + + # Dask config + DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')" + readonly DASK_RUNTIME + readonly DASK_SERVICE=dask-cluster + readonly DASK_WORKER_SERVICE=dask-worker + readonly DASK_SCHEDULER_SERVICE=dask-scheduler + readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/dask-rapids" + + # Knox config + readonly KNOX_HOME=/usr/lib/knox + readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0" + readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0" + enable_worker_service="0" + + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + # Write to a ramdisk instead of churning the persistent disk + if [[ ${free_mem} -ge 5250000 ]]; then + tmpdir=/mnt/shm + mkdir -p /mnt/shm + mount -t tmpfs tmpfs /mnt/shm + + # Download conda packages to tmpfs + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs /mnt/shm + mount -t tmpfs tmpfs /mnt/shm + + # Download pip packages to tmpfs + pip config set global.cache-dir /mnt/shm || echo "unable to set global.cache-dir" + + # Download OS packages to tmpfs + if is_debuntu ; then + mount -t tmpfs tmpfs /var/cache/apt/archives + else + mount -t tmpfs tmpfs 
/var/cache/dnf + fi + else + tmpdir=/tmp + fi + install_log="${tmpdir}/install.log" + trap exit_handler EXIT + + # Monitor disk usage in a screen session + if is_debuntu ; then + apt-get install -y -qq screen + else + dnf -y -q install screen + fi + df -h / | tee "${tmpdir}/disk-usage.log" + touch "${tmpdir}/keep-running-df" + screen -d -m -US keep-running-df \ + bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done" +} + +prepare_to_install + main diff --git a/rapids/run-bazel-tests.sh b/rapids/run-bazel-tests.sh new file mode 100644 index 000000000..4c2ca20a6 --- /dev/null +++ b/rapids/run-bazel-tests.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Run from root directory of initialization-actions checkout + +IMAGE="rapids-actions-image:$BUILD_ID" +max_parallel_tests=10 + +IMAGE_VERSION="$1" +if [[ -z "${IMAGE_VERSION}" ]] ; then + IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" ; fi ; export IMAGE_VERSION + +#declare -a TESTS_TO_RUN=('dask:test_dask' 'rapids:test_rapids') +#declare -a TESTS_TO_RUN=('dask:test_dask') +declare -a TESTS_TO_RUN=('rapids:test_rapids') + +time bazel test \ + --jobs="${max_parallel_tests}" \ + --local_test_jobs="${max_parallel_tests}" \ + --flaky_test_attempts=3 \ + --action_env="INTERNAL_IP_SSH=true" \ + --test_output="errors" \ + --test_arg="--image_version=${IMAGE_VERSION}" \ + "${TESTS_TO_RUN[@]}" diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 4df22249b..63fa72a7f 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -1,4 +1,5 @@ import os +import time import pkg_resources from absl.testing import absltest @@ -6,132 +7,74 @@ from integration_tests.dataproc_test_case import DataprocTestCase - class RapidsTestCase(DataprocTestCase): COMPONENT = "rapids" - INIT_ACTIONS = ["gpu/install_gpu_driver.sh", "rapids/rapids.sh"] - DASK_INIT_ACTIONS = [ - "gpu/install_gpu_driver.sh", "dask/dask.sh", "rapids/rapids.sh" + INIT_ACTIONS = [ + "gpu/install_gpu_driver.sh", "rapids/rapids.sh" ] - GPU_P100 = "type=nvidia-tesla-p100" - GPU_T4 = "type=nvidia-tesla-t4" + GPU_A100 = "type=nvidia-tesla-a100,count=2" + GPU_H100 = "type=nvidia-h100-80gb,count=2" + GPU_T4 = "type=nvidia-tesla-t4" # Tests for RAPIDS init action DASK_RAPIDS_TEST_SCRIPT_FILE_NAME = "verify_rapids_dask.py" - XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME = "verify_xgboost_spark.scala" - def verify_dask_instance(self, name): + def verify_dask_worker_service(self, name): + # Retry the first ssh to ensure it has enough time to propagate SSH keys + for try_number in range(0, 3): + try: + self.assert_instance_command( + name, "[[ X$(systemctl show dask-worker -p SubState --value)X == XrunningX ]]") + break + except: + time.sleep(2**try_number) + + def verify_dask_config(self, name): self.assert_instance_command( - name, "pgrep -f dask-cuda-worker || " - "grep 'class: \"dask_cuda.CUDAWorker\"' /etc/dask/config.yaml") + name, "grep 'class: \"dask_cuda.CUDAWorker\"' /etc/dask/config.yaml") - self.upload_test_file( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), - self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME), name) - verify_cmd = "/opt/conda/default/bin/python {}".format( + def run_dask_script(self, name): + test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), + self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME) + self.upload_test_file(test_filename, name) + verify_cmd = "/opt/conda/miniconda3/envs/dask-rapids/bin/python {}".format( self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME) self.assert_instance_command(name, verify_cmd) 
self.remove_test_script(self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME, name) - def verify_spark_instance(self, name): - self.assert_instance_command(name, "nvidia-smi") - - def verify_spark_job(self): - instance_name = "{}-m".format(self.getClusterName()) - self.upload_test_file( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), - self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME), instance_name) - self.assert_instance_command( - instance_name, """echo :quit | spark-shell \ - --conf spark.executor.resource.gpu.amount=1 \ - --conf spark.task.resource.gpu.amount=1 \ - --conf spark.dynamicAllocation.enabled=false -i {}""".format( - self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME)) - self.remove_test_script(self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME, - instance_name) - - @parameterized.parameters(("STANDARD", ["m", "w-0"], GPU_T4, None), - ("STANDARD", ["m", "w-0"], GPU_T4, "yarn"), - ("STANDARD", ["m"], GPU_T4, "standalone")) + + @parameterized.parameters( +# If a new version of dask-yarn is released, add this test back in. +# ("STANDARD", ["m", "w-0"], GPU_T4, "yarn"), +# ("STANDARD", ["m"], GPU_T4, None), + ("STANDARD", ["m", "w-0"], GPU_T4, "standalone") + ) def test_rapids_dask(self, configuration, machine_suffixes, accelerator, dask_runtime): - if self.getImageVersion() <= pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre 2.0 images") - metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=DASK" if dask_runtime: metadata += ",dask-runtime={}".format(dask_runtime) - self.createCluster( - configuration, - self.DASK_INIT_ACTIONS, - metadata=metadata, - machine_type="n1-standard-4", - master_accelerator=accelerator, - worker_accelerator=accelerator, - boot_disk_size="200GB", - timeout_in_minutes=30) - - for machine_suffix in machine_suffixes: - self.verify_dask_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - - @parameterized.parameters(("SINGLE", ["m"], GPU_T4), - ("STANDARD", ["w-0"], GPU_T4)) - def test_rapids_spark(self, configuration, machine_suffixes, accelerator): - - if self.getImageVersion() <= pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre 2.0 images") - optional_components = None - - metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK") - self.createCluster( configuration, self.INIT_ACTIONS, - optional_components=optional_components, metadata=metadata, - machine_type="n1-standard-4", - master_accelerator=accelerator if configuration == "SINGLE" else None, + machine_type="n1-standard-8", + master_accelerator=accelerator, worker_accelerator=accelerator, - boot_disk_size="200GB", - timeout_in_minutes=30) + boot_disk_size="50GB", + timeout_in_minutes=60) for machine_suffix in machine_suffixes: - self.verify_spark_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - # Only need to do this once - self.verify_spark_job() - - @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4")) - def test_non_default_cuda_versions(self, configuration, machine_suffixes, - accelerator, cuda_version): + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + if dask_runtime == 'standalone' or dask_runtime == None: + self.verify_dask_worker_service(machine_name) + elif dask_runtime == 'yarn': + self.verify_dask_config(machine_name) - if self.getImageVersion() < pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre 2.0 images") - - metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" - ",cuda-version={}".format(cuda_version)) - - self.createCluster( - configuration, - 
self.INIT_ACTIONS, - metadata=metadata, - machine_type="n1-standard-4", - master_accelerator=accelerator if configuration == "SINGLE" else None, - worker_accelerator=accelerator, - boot_disk_size="200GB", - timeout_in_minutes=30) - - for machine_suffix in machine_suffixes: - self.verify_spark_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - # Only need to do this once - self.verify_spark_job() + self.run_dask_script(machine_name) if __name__ == "__main__": diff --git a/rapids/verify_rapids_dask.py b/rapids/verify_rapids_dask.py index f8c888478..10f662215 100644 --- a/rapids/verify_rapids_dask.py +++ b/rapids/verify_rapids_dask.py @@ -2,11 +2,9 @@ import dask_cudf import xgboost -from dask.distributed import Client import dask.array as da import numpy as np - def test_rapids(): # confirm RAPIDS and xgboost are available df = cudf.DataFrame() @@ -19,21 +17,4 @@ def test_rapids(): ds = dask_cudf.from_cudf(df['c'], npartitions=2) ds.compute() - -def test_dask_yarn(): - try: - from dask_yarn import YarnCluster - except: - return - - # Validate dask_yarn configuration - cluster = YarnCluster() - client = Client(cluster) - - cluster.scale(4) - x = da.sum(np.ones(5)) - x.compute() - - test_rapids() -test_dask_yarn() diff --git a/rapids/verify_rapids_dask_yarn.py b/rapids/verify_rapids_dask_yarn.py new file mode 100644 index 000000000..9c6850cc4 --- /dev/null +++ b/rapids/verify_rapids_dask_yarn.py @@ -0,0 +1,19 @@ +from dask.distributed import Client +import dask.array as da +import numpy as np + +def test_dask_yarn(): + try: + from dask_yarn import YarnCluster + except: + return + + # Validate dask_yarn configuration + cluster = YarnCluster() + client = Client(cluster) + + cluster.scale(4) + x = da.sum(np.ones(5)) + x.compute() + +test_dask_yarn() # known to fail for recent relases of rapids
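
The fixed `sleep 10s` added in cloudbuild/run-presubmit-on-k8s.sh papers over the window in which freshly fetched cluster credentials are not yet usable. If the flakiness persists, a bounded readiness poll tends to be more robust than a fixed delay; the snippet below is only an illustrative sketch (not part of this change) and assumes the same kubectl context the script has just configured.

# Sketch: wait up to ~60s for the cluster API to answer before 'kubectl run'.
for attempt in $(seq 1 12); do
  if kubectl get serviceaccount default >/dev/null 2>&1; then
    break
  fi
  echo "Cluster API not ready yet (attempt ${attempt}); retrying..."
  sleep 5
done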
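
For a quick manual spot-check of the standalone Dask runtime outside of Bazel, something along these lines can be run from a workstation. It is a sketch only: the cluster name and zone are placeholders, while the service names, the dask-rapids environment path, and the cudf/dask_cudf imports mirror what rapids.sh and verify_rapids_dask.py set up above.

#!/bin/bash
# Sketch: confirm Dask services and the RAPIDS conda env on a test cluster.
CLUSTER="${1:-rapids-smoke-test}"   # placeholder cluster name
ZONE="${2:-us-central1-a}"          # placeholder zone

for suffix in m w-0; do
  node="${CLUSTER}-${suffix}"
  gcloud compute ssh "${node}" --zone "${ZONE}" --command '
    systemctl is-active dask-scheduler || true   # active on the master only
    systemctl is-active dask-worker   || true    # active wherever the worker was enabled
    /opt/conda/miniconda3/envs/dask-rapids/bin/python -c "import cudf, dask_cudf; print(cudf.__version__)"
  '
done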