diff --git a/.github/workflows/testflinger-contrib-dss-regression.yaml b/.github/workflows/testflinger-contrib-dss-regression.yaml index 217c2088f2..b40e1b466d 100644 --- a/.github/workflows/testflinger-contrib-dss-regression.yaml +++ b/.github/workflows/testflinger-contrib-dss-regression.yaml @@ -21,36 +21,31 @@ jobs: run: working-directory: contrib/checkbox-dss-validation strategy: + fail-fast: false matrix: dss_channel: - latest/stable - latest/edge + microk8s_channel: + - 1.28/stable + - 1.31/stable queue: - - dell-precision-3470-c30322 #ADL iGPU + NVIDIA GPU - - dell-precision-5680-c31665 #RPL iGPU + Arc Pro A60M dGPU + - name: dell-precision-3470-c30322 #ADL iGPU + NVIDIA GPU + provision_data: "distro: jammy" + - name: dell-precision-5680-c31665 #RPL iGPU + Arc Pro A60M dGPU + provision_data: "url: http://10.102.196.9/somerville/Platforms/jellyfish-muk/X96_A00/dell-bto-jammy-jellyfish-muk-X96-20230419-19_A00.iso" + - name: nvidia-dgx-station-c25989 # NO iGPU + NVIDIA GPU + provision_data: "distro: jammy" steps: - name: Check out code uses: actions/checkout@v4 - - name: Build job file from template with maas2 provisioning - if: ${{ matrix.queue == 'dell-precision-3470-c30322' }} - env: - PROVISION_DATA: "distro: jammy" + - name: Build job file from template run: | sed -e "s|REPLACE_BRANCH|${BRANCH}|" \ - -e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \ - -e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \ - -e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \ - ${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \ - ${GITHUB_WORKSPACE}/job.yaml - - name: Build job file from template with oemscript provisioning - if: ${{ matrix.queue == 'dell-precision-5680-c31665' }} - env: - PROVISION_DATA: "url: http://10.102.196.9/somerville/Platforms/jellyfish-muk/X96_A00/dell-bto-jammy-jellyfish-muk-X96-20230419-19_A00.iso" - run: | - sed -e "s|REPLACE_BRANCH|${BRANCH}|" \ - -e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \ - -e 
"s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \ + -e "s|REPLACE_QUEUE|${{ matrix.queue.name }}|" \ + -e "s|REPLACE_PROVISION_DATA|${{ matrix.queue.provision_data }}|" \ -e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \ + -e "s|REPLACE_MICROK8S_CHANNEL|${{ matrix.microk8s_channel }}|" \ ${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \ ${GITHUB_WORKSPACE}/job.yaml - name: Submit testflinger job diff --git a/contrib/checkbox-dss-validation/README.md b/contrib/checkbox-dss-validation/README.md index d2d06111c8..d80926d07a 100644 --- a/contrib/checkbox-dss-validation/README.md +++ b/contrib/checkbox-dss-validation/README.md @@ -1,12 +1,14 @@ # Welcome to the Checkbox DSS project! -This repository contains the Checkbox DSS Provider (test cases and test plans for validating Intel GPU support in the [Data Science Stack](https://documentation.ubuntu.com/data-science-stack/en/latest/)) as well as everything that is required to build the `checkbox-dss` snap. +This repository contains the Checkbox DSS Provider (test cases and test plans for validating Intel and NVIDIA GPU support in the [Data Science Stack](https://documentation.ubuntu.com/data-science-stack/en/latest/)) as well as everything that is required to build the `checkbox-dss` snap. # Requirements - Ubuntu Jammy (22.04) - Supported hardware platforms: + - No GPUs - Intel platforms with recent GPU (>= Broadwell) + - Recent NVIDIA GPU # Installation @@ -19,7 +21,7 @@ lxd init --auto git clone https://github.com/canonical/checkbox cd checkbox/contrib/checkbox-dss-validation snapcraft -sudo snap install --dangerous --classic ./checkbox-dss_2.0_amd64.snap +sudo snap install --dangerous --classic ./checkbox-dss_3.0_amd64.snap ``` Make sure that the provider service is running and active: @@ -40,15 +42,27 @@ By default this will install the `data-science-stack` snap from the `latest/stab channel. 
To instead install from `latest/edge` use: ```shell -checkbox-dss.install-deps --dss-snap-channel=latest/edge +checkbox-dss.install-deps --dss-snap-channel latest/edge ``` +Furthermore, the default `microk8s` snap channel is `1.28/stable` in classic mode, +but this can be customized as follows +(please note that this snap must be installed with `--classic` to enable GPU support): + +```shell +checkbox-dss.install-deps --microk8s-snap-channel 1.31/stable +``` + +These validations also need the `kubectl` snap installed, and the default channel +used for that is `1.29/stable`, but can be customized as shown previously by passing +the appropriate channel name for `--kubectl-snap-channel`. + # Automated Run To run the test plans: ```shell -checkbox-dss.validate-intel-gpu +checkbox-dss.validate-with-gpu ``` # Cleanup diff --git a/contrib/checkbox-dss-validation/bin/install-deps b/contrib/checkbox-dss-validation/bin/install-deps index aaefb6dff1..59f7263cdd 100755 --- a/contrib/checkbox-dss-validation/bin/install-deps +++ b/contrib/checkbox-dss-validation/bin/install-deps @@ -1,56 +1,89 @@ #!/bin/bash set -e -echo -e "\nStep 1/5: Installing microk8s snap" -sudo snap install microk8s --channel 1.28/stable --classic - -USER=$(id -nu ${SNAP_UID}) -HOME=${SNAP_REAL_HOME} - -# microk8s commands run from tests are run without sudo -sudo usermod -a -G microk8s $USER -# Directory needed for sharing microk8s config with kubectl snap -mkdir -p $HOME/.kube - -echo -e "\nStep 2/5: Configuring microk8s addons" -sudo microk8s status --wait-ready -# Give microk8s another minute to stabilize -# to avoid intermittent failures when -# enabling hostpath-storage -echo "Giving microk8s a minute to stabilize..." -sleep 60 -sudo microk8s enable hostpath-storage -sudo microk8s enable dns -sudo microk8s enable rbac - -echo "Waiting for microk8s addons to become ready..."
-sudo microk8s.kubectl wait \ - --for=condition=available \ - --timeout 1800s \ - -n kube-system \ - deployment/coredns \ - deployment/hostpath-provisioner -sudo microk8s.kubectl -n kube-system rollout status ds/calico-node - -# This is needed to overcome the following bug within microk8s: -# https://github.com/canonical/microk8s/issues/4453 -echo -e "\nStep 3/5: Installing kubectl snap" -sudo snap install kubectl --classic --channel=1.29/stable -# hack as redirecting stdout anywhere but /dev/null throws a permission denied error -# see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4 -sudo microk8s.kubectl config view --raw | tee $HOME/.kube/config > /dev/null - -# intel_gpu_top command used for host-level GPU check -# jq used for cases where jsonpath is insufficient for parsing json results -echo -e "\nStep 4/5: Installing intel-gpu-tools" -DEBIAN_FRONTEND=noninteractive sudo apt install -y intel-gpu-tools jq - -echo -e "\nStep 5/5: Installing data-science-stack snap" -optional_arg=$1 -if [ "${optional_arg}" = "--dss-snap-channel=latest/edge" ]; then - echo "Installing from edge" - sudo snap install data-science-stack --channel latest/edge -else - echo "Installing from stable" - sudo snap install data-science-stack --channel latest/stable -fi +dss_snap_channel="latest/stable" +microk8s_snap_channel="1.28/stable" +kubectl_snap_channel="1.29/stable" + +setup_microk8s_snap() { + echo -e "\nInstalling microk8s snap from channel $1" + sudo snap install microk8s --channel "$1" --classic + + SNAP_USER=$(id -nu "${SNAP_UID}") + + # microk8s commands run from tests are run without sudo + sudo usermod -a -G microk8s "$SNAP_USER" + # Directory needed for sharing microk8s config with kubectl snap + mkdir -p "${SNAP_REAL_HOME}/.kube" + + echo -e "\nConfiguring microk8s addons" + sudo microk8s status --wait-ready + # Give microk8s another minute to stabilize + # to avoid intermittent failures when + # enabling hostpath-storage + echo "Giving microk8s a 
minute to stabilize..." + sleep 60 + sudo microk8s enable hostpath-storage + sudo microk8s enable dns + sudo microk8s enable rbac + + echo "Waiting for microk8s addons to become ready..." + sudo microk8s.kubectl wait \ + --for=condition=available \ + --timeout 1800s \ + -n kube-system \ + deployment/coredns \ + deployment/hostpath-provisioner + sudo microk8s.kubectl -n kube-system rollout status ds/calico-node +} + +setup_kubectl_snap() { + # This is needed to overcome the following bug within microk8s: + # https://github.com/canonical/microk8s/issues/4453 + echo -e "\nInstalling kubectl snap from channel $1" + sudo snap install kubectl --classic --channel="$1" + # hack as redirecting stdout anywhere but /dev/null throws a permission denied error + # see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4 + sudo microk8s.kubectl config view --raw | tee "${SNAP_REAL_HOME}/.kube/config" >/dev/null +} + +help_function() { + echo "This script is used to install all dependencies for checkbox-dss to run; defaults for optional arguments are shown in usage" + echo "Usage: checkbox-dss.install-deps [--dss-snap-channel $dss_snap_channel] [--microk8s-snap-channel $microk8s_snap_channel] [--kubectl-snap-channel $kubectl_snap_channel]" +} + +main() { + while [ $# -ne 0 ]; do + case $1 in + --dss-snap-channel) + dss_snap_channel="$2" + shift 2 + ;; + --microk8s-snap-channel) + microk8s_snap_channel="$2" + shift 2 + ;; + --kubectl-snap-channel) + kubectl_snap_channel="$2" + shift 2 + ;; + *) help_function; exit 1 ;; # unknown option: print usage and stop; without a shift or exit this loop would never terminate + esac + done + + echo -e "\n Step 1/4: Setting up microk8s" + setup_microk8s_snap "$microk8s_snap_channel" + + echo -e "\n Step 2/4: Setting up kubectl" + setup_kubectl_snap "$kubectl_snap_channel" + + # intel_gpu_top command used for host-level GPU check + # jq used for cases where jsonpath is insufficient for parsing json results + echo -e "\nStep 3/4: Installing intel-gpu-tools" + DEBIAN_FRONTEND=noninteractive sudo apt install -y intel-gpu-tools jq + + echo
-e "\nStep 4/4: Installing data-science-stack snap from channel $dss_snap_channel" + sudo snap install data-science-stack --channel "$dss_snap_channel" +} + +main "$@" diff --git a/contrib/checkbox-dss-validation/bin/validate-intel-gpu b/contrib/checkbox-dss-validation/bin/validate-with-gpu similarity index 79% rename from contrib/checkbox-dss-validation/bin/validate-intel-gpu rename to contrib/checkbox-dss-validation/bin/validate-with-gpu index cfd23ddbb9..08f50af6b9 100755 --- a/contrib/checkbox-dss-validation/bin/validate-intel-gpu +++ b/contrib/checkbox-dss-validation/bin/validate-with-gpu @@ -1,4 +1,4 @@ -#!/usr/bin/env -S checkbox-cli-wrapper remote 127.0.0.1 +#!/usr/bin/env -S checkbox-cli-wrapper control 127.0.0.1 [launcher] app_id = com.canonical.contrib.dss-validation:checkbox launcher_version = 1 @@ -14,5 +14,5 @@ forced = yes [ui] type = silent auto_retry = yes -max_attempts = 10 +max_attempts = 2 delay_before_retry = 10 diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh new file mode 100755 index 0000000000..3bcfc8059d --- /dev/null +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +check_nvidia_gpu_addon_can_be_enabled() { + # TODO: enable changing GPU_OPERATOR_VERSION + GPU_OPERATOR_VERSION=24.6.2 + echo "[INFO]: enabling the NVIDIA GPU addon" + sudo microk8s enable gpu --driver=operator --version="$GPU_OPERATOR_VERSION" + SLEEP_SECS=10 + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU feature discovery has rolled out." + sleep ${SLEEP_SECS} + microk8s.kubectl -n gpu-operator-resources rollout status ds/gpu-operator-node-feature-discovery-worker + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking if daemonsets have rolled out." 
+ sleep ${SLEEP_SECS} + microk8s.kubectl -n gpu-operator-resources rollout status ds/nvidia-device-plugin-daemonset + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU validations have rolled out." + sleep ${SLEEP_SECS} + echo "[INFO]: Waiting for the GPU validations to rollout" + microk8s.kubectl -n gpu-operator-resources rollout status ds/nvidia-operator-validator + echo "Test success: NVIDIA GPU addon enabled." +} + +check_nvidia_gpu_validations_succeed() { + SLEEP_SECS=5 + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking if GPU validations were successful." + sleep ${SLEEP_SECS} + result=$(microk8s.kubectl logs -n gpu-operator-resources -lapp=nvidia-operator-validator -c nvidia-operator-validator) + if [ "${result}" = "all validations are successful" ]; then + echo "Test success: NVIDIA GPU validations were successful!" + else + >&2 echo "Test failure: NVIDIA GPU validations were not successful, got ${result}" + exit 1 + fi +} + +help_function() { + echo "This script is used for tests related to CUDA" + echo "Usage: check_cuda.sh TEST_CASE" + echo + echo "Test cases currently implemented:" + echo -e "\t: check_nvidia_gpu_addon_can_be_enabled" + echo -e "\t: check_nvidia_gpu_validations_succeed" +} + +main() { # dispatch on the first argument; ${1:-} keeps 'set -u' from aborting before the help text can be shown when no argument is given + case ${1:-} in + gpu_addon_can_be_enabled) check_nvidia_gpu_addon_can_be_enabled ;; + gpu_validations_succeed) check_nvidia_gpu_validations_succeed ;; + *) help_function ;; + esac +} + +main "$@" diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh index f3530a39fc..8a75be22a0 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh @@ -13,7 +13,6 @@ export -n PYTHONHOME PYTHONPATH PYTHONUSERBASE check_dss_can_be_initialized() { # TODO: we actually seem to initialize dss here; maybe split it out - cd "${HOME}" dss initialize
--kubeconfig="$(sudo microk8s config)" echo "Test success: dss initialized." } @@ -27,84 +26,61 @@ check_dss_namespace_is_deployed() { fi } -check_mlflow_status_is_ready() { - cd "${HOME}" +check_dss_status_contains() { result=$(dss status) # save result to shell var to avoid broken pipe error - if echo "${result}" | grep -q "MLflow deployment: Ready"; then - echo "Test success: 'dss status' shows ready status for mlflow." + if echo "${result}" | grep -q "${1}"; then + echo "Test success: 'dss status' shows '$1'." else - >&2 echo "Test failure: 'dss status' does not show ready status for mlflow." + >&2 echo "Test failure: 'dss status' does not show '$1'." exit 1 fi } -check_mlflow_is_deployed_as_first_service() { - # TODO: enable mlflow to be a service in any position - result=$(microk8s.kubectl get service -n dss -o jsonpath='{.items[0].metadata.name}') - if [ "${result}" = "mlflow" ]; then - echo "Test success: 'mlflow' service is deployed!" +check_dss_can_create_notebook() { + if dss create "${@}"; then + echo "Test success: successfully created notebook with '$*'." else - >&2 echo "Test failure: expected service name 'mlflow' but got ${result}" + >&2 echo "Test failure: failed to create notebook with '$*'." exit 1 fi } -check_dss_has_intel_gpu_acceleration_enabled() { - cd "${HOME}" - result=$(dss status) # save result to shell var to avoid broken pipe error - if echo "${result}" | grep -q "Intel GPU acceleration: Enabled"; then - echo "Test success: 'dss status' correctly reports Intel GPU status." - else - >&2 echo "Test failure: 'dss status' does not report that Intel GPU acceleration is enabled." - exit 1 - fi -} - -check_dss_can_create_itex_215_notebook() { - cd "${HOME}" - if dss create itex-215-notebook --image=intel/intel-extension-for-tensorflow:2.15.0-xpu-idp-jupyter; then - echo "Test success: successfully created an ITEX 2.15 notebook." - else - >&2 echo "Test failure: failed to create an ITEX 2.15 notebook." 
- exit 1 - fi -} - -check_dss_can_create_ipex_2120_notebook() { - cd "${HOME}" - if dss create ipex-2120-notebook --image=intel/intel-extension-for-pytorch:2.1.20-xpu-idp-jupyter; then - echo "Test success: successfully created an IPEX 2.1.20 notebook." +check_dss_can_remove_notebook() { + if dss remove "$@"; then + echo "Test success: successfully removed notebook with '$*'." else - >&2 echo "Test failure: failed to create an IPEX 2.1.20 notebook." + >&2 echo "Test failure: failed to remove notebook with '$*'." exit 1 fi } help_function() { echo "This script is used for generic tests related to DSS" - echo "Usage: check_dss.sh " + echo "Usage: check_dss.sh [args]..." echo echo "Test cases currently implemented:" - echo -e "\t: check_dss_can_be_initialized" - echo -e "\t: check_dss_namespace_is_deployed" + echo -e "\t: check_dss_can_be_initialized" + echo -e "\t: check_dss_namespace_is_deployed" echo -e "\t: check_mlflow_status_is_ready" - echo -e "\t: check_mlflow_is_deployed_as_first_service" echo -e "\t: check_dss_has_intel_gpu_acceleration_enabled" - echo -e "\t: check_dss_can_create_itex_215_notebook" - echo -e "\t: check_dss_can_create_ipex_2120_notebook" + echo -e "\t: check_dss_has_nvidia_gpu_acceleration_enabled" + echo -e "\t: check_dss_can_create_notebook [args]" + echo -e "\t: check_dss_can_remove_notebook " } main() { + pushd "${HOME}" case ${1} in - dss_can_be_initialized) check_dss_can_be_initialized ;; - dss_namespace_is_deployed) check_dss_namespace_is_deployed ;; - mlflow_status_is_ready) check_mlflow_status_is_ready ;; - mlflow_is_deployed_as_first_service) check_mlflow_is_deployed_as_first_service ;; - intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;; - can_create_itex_215_notebook) check_dss_can_create_itex_215_notebook ;; - can_create_ipex_2120_notebook) check_dss_can_create_ipex_2120_notebook ;; + can_be_initialized) check_dss_can_be_initialized ;; + namespace_is_deployed) check_dss_namespace_is_deployed ;; + 
mlflow_status_is_ready) check_dss_status_contains "MLflow deployment: Ready" ;; + intel_gpu_acceleration_is_enabled) check_dss_status_contains "Intel GPU acceleration: Enabled.*" ;; + nvidia_gpu_acceleration_is_enabled) check_dss_status_contains "NVIDIA GPU acceleration: Enabled.*" ;; + can_create_notebook) check_dss_can_create_notebook "${@:2}" ;; + can_remove_notebook) check_dss_can_remove_notebook "${@:2}" ;; *) help_function ;; esac + popd } main "$@" diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh index c43f87445f..d0754acfea 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh @@ -2,18 +2,16 @@ set -euxo pipefail -check_host_has_intel_gpus() { - result=$(intel_gpu_top -L) - if [[ ${result} == *"pci:vendor=8086"* ]]; then - echo "Test success: Intel GPU available on host: ${result}" - else - >&2 echo "Test failure: "intel_gpu_top -L" reports no Intel GPUs: ${result}" - exit 1 - fi -} +# IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation +SLOTS_PER_GPU=10 check_intel_gpu_plugin_can_be_installed() { - # Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453 + if microk8s.kubectl get daemonset.apps | grep -q "intel-gpu-plugin"; then + echo "Test success: 'intel-gpu-plugin' daemonset is already deployed!" 
+ exit 0 + fi + + # NOTE: Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453 # TODO: make version a param VERSION=v0.30.0 @@ -22,7 +20,7 @@ check_intel_gpu_plugin_can_be_installed() { kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd?ref=${VERSION} | tee /tmp/node_feature_discovery.yaml >/dev/null kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd/overlays/node-feature-rules?ref=${VERSION} | tee /tmp/node_feature_rules.yaml >/dev/null kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/gpu_plugin/overlays/nfd_labeled_nodes?ref=${VERSION} | tee /tmp/gpu_plugin.yaml >/dev/null - sed -i 's/enable-monitoring/enable-monitoring\n - -shared-dev-num=10/' /tmp/gpu_plugin.yaml + sed -i "s/enable-monitoring/enable-monitoring\n - -shared-dev-num=${SLOTS_PER_GPU}/" /tmp/gpu_plugin.yaml kubectl apply -f /tmp/node_feature_discovery.yaml kubectl apply -f /tmp/node_feature_rules.yaml kubectl apply -f /tmp/gpu_plugin.yaml @@ -77,6 +75,8 @@ check_intel_gpu_node_label_is_attached() { } check_at_least_one_intel_gpu_is_available() { + # IMPORTANT NOTE: this test also counts NVIDIA GPUs once their plugin is enabled. + # The inaccuracy in gpu.intel.com label's value and not controlled by us result=$(microk8s.kubectl get node -o json | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' | awk '{cnt+=$1} END{print cnt}') if [ "${result}" -ge 1 ]; then echo "Test success: Found ${result} GPUs on system." 
@@ -87,29 +87,21 @@ check_at_least_one_intel_gpu_is_available() { } check_capacity_slots_for_intel_gpus_match() { - num_gpus=$(microk8s.kubectl get node -o json | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' | awk '{cnt+=$1} END{print cnt}') result=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.capacity.gpu\.intel\.com/i915}') - # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation - SLOTS_PER_GPU=10 - total_slots=$((num_gpus * SLOTS_PER_GPU)) - if [ "${total_slots}" -eq "${result}" ]; then + if [ "${result}" -ge "${SLOTS_PER_GPU}" ]; then echo "Test success: Found ${result} GPU capacity slots on k8s node." else - >&2 echo "Test failure: expected ${total_slots} GPU capacity slots but got ${result}" + >&2 echo "Test failure: expected at least ${SLOTS_PER_GPU} GPU capacity slots but got ${result}" exit 1 fi } check_allocatable_slots_for_intel_gpus_match() { - num_gpus=$(microk8s.kubectl get node -o json | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' | awk '{cnt+=$1} END{print cnt}') result=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.allocatable.gpu\.intel\.com/i915}') - # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation - SLOTS_PER_GPU=10 - total_slots=$((num_gpus * SLOTS_PER_GPU)) - if [ "${total_slots}" -eq "${result}" ]; then + if [ "${result}" -ge "${SLOTS_PER_GPU}" ]; then echo "Test success: Found ${result} GPU allocatable slots on k8s node."
else - >&2 echo "Test failure: expected ${total_slots} GPU allocatable slots but got ${result}" + >&2 echo "Test failure: expected ${SLOTS_PER_GPU} GPU allocatable slots but got ${result}" exit 1 fi } @@ -119,7 +111,6 @@ help_function() { echo "Usage: check.sh " echo echo "Test cases currently implemented:" - echo -e "\t: check_host_has_intel_gpus" echo -e "\t: check_intel_gpu_plugin_can_be_installed" echo -e "\t: check_intel_gpu_plugin_daemonset_is_deployed" echo -e "\t: check_one_intel_gpu_plugin_daemonset_is_available" @@ -132,7 +123,6 @@ help_function() { main() { case ${1} in - host_has_intel_gpus) check_host_has_intel_gpus ;; gpu_plugin_can_be_installed) check_intel_gpu_plugin_can_be_installed ;; gpu_plugin_daemonset_is_deployed) check_intel_gpu_plugin_daemonset_is_deployed ;; one_daemonset_is_available) check_one_intel_gpu_plugin_daemonset_is_available ;; diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_ipex.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_ipex.sh deleted file mode 100755 index e0b7c4d5ea..0000000000 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_ipex.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env bash - -set -euxo pipefail - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) - -check_ipex_can_be_imported() { - echo "Starting ipex import test" - script="import intel_extension_for_pytorch as ipex; import torch; import jupyter" - if microk8s.kubectl -n dss exec "$1" -- python3 -c "$script"; then - echo "PASS: Found module" - exit 0 - else - >&2 echo "FAIL: Did not find IPEX python module" - exit 1 - fi -} - -check_pytorch_can_use_xpu() { - echo "Starting ipex GPU check test" - script="$(cat "$SCRIPT_DIR/pytorch_can_use_xpu.py")" - gpu_grep_out=$(microk8s.kubectl -n dss exec "$1" -- python3 -c "$script" | grep "dev_type=.gpu" 2>&1) - if [[ -z ${gpu_grep_out} ]]; then - >&2 echo "FAIL: No GPU found" - exit 1 - else - echo "PASS: GPU found" - 
exit 0 - fi -} - -help_function() { - echo "This script is used for tests related to IPEX" - echo "Usage: check_dss.sh " - echo - echo "Test cases currently implemented:" - echo -e "\t: check_itex_can_be_imported" - echo -e "\t: check_pytorch_can_use_xpu" -} - -main() { - pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'ipex-2120-notebook\S*') - echo "Found PyTorch pod: ${pod}" - case ${1} in - can_be_imported) check_ipex_can_be_imported "$pod" ;; - pytorch_can_use_xpu) check_pytorch_can_use_xpu "$pod" ;; - *) help_function ;; - esac -} - -main "$@" diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_itex.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_itex.sh deleted file mode 100755 index 1a19b1cd01..0000000000 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_itex.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash - -set -euxo pipefail - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) - -check_itex_can_be_imported() { - echo "Starting itex import test" - script="import intel_extension_for_tensorflow as itex; import tensorflow; import jupyter" - if microk8s.kubectl -n dss exec "$1" -- python3 -c "$script"; then - echo "PASS: Found module" - exit 0 - else - >&2 echo "FAIL: Did not find ITEX python module" - exit 1 - fi -} - -check_tensorflow_can_use_xpu() { - echo "Starting itex GPU check test" - script="$(cat "$SCRIPT_DIR/tensorflow_can_use_xpu.py")" - if microk8s.kubectl -n dss exec "$1" -- python3 -c "$script"; then - echo "PASS: XPU found" - exit 0 - else - >&2 echo "FAIL: No XPU found" - exit 1 - fi -} - -help_function() { - echo "This script is used for tests related to ITEX" - echo "Usage: check_dss.sh " - echo - echo "Test cases currently implemented:" - echo -e "\t: check_itex_can_be_imported" - echo -e "\t: check_tensorflow_can_use_xpu" -} - -main() { - pod=$(microk8s.kubectl 
get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'itex-215-notebook\S*') - echo "Found Tensorflow pod: ${pod}" - case ${1} in - can_be_imported) check_itex_can_be_imported "$pod" ;; - tensorflow_can_use_xpu) check_tensorflow_can_use_xpu "$pod" ;; - *) help_function ;; - esac -} - -main "$@" diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_notebook.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_notebook.sh new file mode 100755 index 0000000000..2f49ebc7ee --- /dev/null +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_notebook.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +pytorch_can_use_cpu_script="import torch; print(torch.__version__)" +tensorflow_can_use_cpu_script="import tensorflow as tf; print(tf.config.experimental.list_physical_devices())" +pytorch_can_use_xpu_script="$(cat "$SCRIPT_DIR/pytorch_can_use_xpu.py")" +tensorflow_can_use_xpu_script="$(cat "$SCRIPT_DIR/tensorflow_can_use_xpu.py")" +pytorch_can_use_cuda_script="import torch; assert torch.cuda.is_available(), 'CUDA is not available'" +tensorflow_can_use_cuda_script="$(cat "$SCRIPT_DIR/tensorflow_can_use_cuda.py")" + +check_notebook_can_run_python_script_in_pod() { + if microk8s.kubectl -n dss exec "$1" -- python -c "$2"; then + echo "Test success: in pod $1" + else + err_code=$? + >&2 echo "Test failed: in pod $1 with error code ${err_code}" + exit $err_code + fi +} + +help_function() { + echo "This script is used for tests related to running things in notebooks' pods" + echo "Usage: check_notebook.sh [args]..." + echo + echo "Test cases currently implemented:" + echo -e "\t: check_notebook_can_run_python_script_in_pod