From 7662215766b520d006def883c8b0cf8dba440a1f Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Thu, 16 Jan 2025 10:56:23 -0800
Subject: [PATCH] exercised older CUDA and mig a100 use case more ; added pytorch installation functionality

---
 templates/gpu/install_functions | 112 ++++++++++++++++++++++----------
 templates/gpu/mig_functions     |  55 ++++++++++++++--
 templates/gpu/util_functions    |   7 +-
 3 files changed, 131 insertions(+), 43 deletions(-)

diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions
index 1ba76c236..8effce9b4 100644
--- a/templates/gpu/install_functions
+++ b/templates/gpu/install_functions
@@ -1,14 +1,15 @@
 function set_cudnn_version() {
-  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+  readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN8_VERSION="8.3.1.22"
   readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
 
   # Parameters for NVIDIA-provided cuDNN library
   DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
   readonly DEFAULT_CUDNN_VERSION
   CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-  if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
-    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+  # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION}
+  if ( is_rocky && version_le "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then
+    CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
   elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
     # cuDNN v8 is not distribution for ubuntu20+, debian12
     CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
@@ -303,30 +304,6 @@ function install_nvidia_nccl() {
 
   local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
 
-  # https://github.com/NVIDIA/nccl/blob/master/README.md
-  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-  # Fermi: SM_20, compute_30
-  # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
-  # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
-  # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
-
-  # The following architectures are suppored by open kernel driver
-  # Volta: SM_70,SM_72, compute_70,compute_72
-  # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
-
-  # The following architectures are supported by CUDA v11.8+
-  # Ada: SM_89, compute_89
-  # Hopper: SM_90,SM_90a compute_90,compute_90a
-  # Blackwell: SM_100, compute_100
-  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
-  if version_gt "${CUDA_VERSION}" "11.6" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
-  if version_ge "${CUDA_VERSION}" "11.8" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
-  if version_ge "${CUDA_VERSION}" "12.0" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
-
   mkdir -p "${workdir}"
   pushd "${workdir}"
 
@@ -347,6 +324,30 @@ function install_nvidia_nccl() {
   local local_tarball="${workdir}/${build_tarball}"
   local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
 
+  # https://github.com/NVIDIA/nccl/blob/master/README.md
+  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Fermi: SM_20, compute_30
+  # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+  # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+  # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+  # The following architectures are supported by open kernel driver
+  # Volta: SM_70,SM_72, compute_70,compute_72
+  # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+  # The following architectures are supported by CUDA v11.8+
+  # Ada: SM_89, compute_89
+  # Hopper: SM_90,SM_90a compute_90,compute_90a
+  # Blackwell: SM_100, compute_100
+  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
+  if version_gt "${CUDA_VERSION}" "11.6" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
+  if version_ge "${CUDA_VERSION}" "11.8" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
+  if version_ge "${CUDA_VERSION}" "12.0" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
+
   output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
   if echo "${output}" | grep -q "${gcs_tarball}" ; then
     # cache hit - unpack from cache
@@ -369,11 +370,12 @@ function install_nvidia_nccl() {
       export NVCC_GENCODE
       execute_with_retries make -j$(nproc) pkg.redhat.build
     fi
-    tar czvf "/${local_tarball}" "../${build_path}"
-    gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-    rm "${local_tarball}"
+    tar czvf "${local_tarball}" "../${build_path}"
     make clean
     popd
+    tar xzvf "${local_tarball}"
+    gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    rm "${local_tarball}"
   fi
   gcloud storage cat "${gcs_tarball}" | tar xz
 }
@@ -415,16 +417,16 @@ function install_nvidia_cudnn() {
       apt-get -y install nvidia-cudnn
   else
     if is_cudnn8 ; then
-      install_local_cudnn8_repo
+      add_repo_cuda
 
       apt-get update -qq
+      # Ignore version requested and use the latest version in the package index
+      cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)"
 
      execute_with_retries \
        apt-get -y install --no-install-recommends \
          "libcudnn8=${cudnn_pkg_version}" \
          "libcudnn8-dev=${cudnn_pkg_version}"
-
-      uninstall_local_cudnn8_repo
      sync
     elif is_cudnn9 ; then
       install_cuda_keyring_pkg
@@ -452,6 +454,48 @@ function install_nvidia_cudnn() {
   mark_complete cudnn
 }
 
+function install_pytorch() {
+  if test -f "${workdir}/complete/pytorch" ; then return ; fi
+  local env
+  env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
+  local mc3=/opt/conda/miniconda3
+  local envpath="${mc3}/envs/${env}"
+  # Set numa node to 0 for all GPUs
+  for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
+  local verb=create
+  if test -d "${envpath}" ; then verb=install ; fi
+
+  readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
+  case "${INCLUDE_PYTORCH^^}" in
+    "1" | "YES" | "TRUE" )
+      local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+      local local_tarball="${workdir}/${build_tarball}"
+      local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
+
+      output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+      if echo "${output}" | grep -q "${gcs_tarball}" ; then
+        # cache hit - unpack from cache
+        echo "cache hit"
+        mkdir -p "${envpath}"
+        gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
+      else
+        cudart_spec="cuda-cudart"
+        if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
+        "${mc3}/bin/mamba" "${verb}" -n "${env}" \
+          -c conda-forge -c nvidia -c rapidsai \
+          numba pytorch tensorflow[and-cuda] rapids pyspark \
+          "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
+        pushd "${envpath}"
+        tar czf "${local_tarball}" .
+        popd
+        gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      fi
+      ;;
+    * ) echo "skip pytorch install" ;;
+  esac
+  touch "${workdir}/complete/pytorch"
+}
+
 function add_nonfree_components() {
   if is_src_nvidia ; then return; fi
   if ge_debian12 ; then
diff --git a/templates/gpu/mig_functions b/templates/gpu/mig_functions
index 7ec29aa25..7d94b7dcf 100644
--- a/templates/gpu/mig_functions
+++ b/templates/gpu/mig_functions
@@ -65,38 +65,81 @@ function configure_mig_cgi() {
 function enable_mig() {
   is_complete enable-mig && return
 
-  # Start persistenced if it's not already running
-#  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
+  # All devices on the same numa node
   for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
     # Write an ascii zero to the numa node indicator
     echo "0" | dd of="${f}" status=none
   done
+
+  echo "Stopping services and kernel modules in preparation for enabling mig."
+  if ( ps auwx | grep -i nvidia\\-persistenced ) ; then killall -9 nvidia-persistenced ; fi
+  # nvidia-smi --query-compute-apps=pid --format=csv,noheader
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
       systemctl stop "hadoop-yarn-${svc}.service"
     fi
   done
+  # can lsof be used to determine what processes have a file with name =~ /nvidia/ under the /dev/ directory ?
+  # if so, stop the service which launches the process with the open filehandle
+
+  MIG_GPU_LIST="`nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n ""`"
+  NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
+
+# root@cluster-1718310842-m:/tmp# for m in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do sudo rmmod $m ; done
+# rmmod: ERROR: Module nvidia_drm is not currently loaded
+# rmmod: ERROR: Module nvidia_modeset is not currently loaded
+# rmmod: ERROR: Module nvidia_uvm is not currently loaded
+# rmmod: ERROR: Module nvidia is not currently loaded
+# root@cluster-1718310842-m:/tmp# nvidia-smi -i 0 --gpu-reset
+# Resetting GPU 00000000:00:04.0 is not supported.
+# root@cluster-1718310842-m:/tmp# nvidia-smi -i 0 --multi-instance-gpu=1
+# Warning: MIG mode is in pending enable state for GPU 00000000:00:04.0:Not Supported
+# Reboot the system or try nvidia-smi --gpu-reset to make MIG mode effective on GPU 00000000:00:04.0
+# All done.
+# root@cluster-1718310842-m:/tmp# echo $?
+# 0
+# root@cluster-1718310842-m:/tmp# /usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader
+# Disabled
+
+  if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+  time nvsmi --gpu-reset || { # 30s
-  nvsmi --gpu-reset || {
    echo "unable to reset gpu.  Trying to stop services and kernel modules which may have a lock."
    # TODO: find a way to reset the A100 without reboot
+    removed="1"
    for tryno in {1..25} ; do
      removed="1"
      for mod in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do
        if lsmod | grep -q "${mod}" ; then rmmod $mod > /dev/null 2>&1 || removed="0" ; fi ; done
      if [[ "${removed}" == "1" ]] ; then
        echo "modules removed successfully"
-        nvsmi --gpu-reset
-        break
+        nvsmi --gpu-reset && break
      fi
    done
  }
-  nvsmi -mig 1
+
+  if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+    for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do
+      if version_le "${CUDA_VERSION}" "11.6" ; then
+        nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
+      else
+        nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
+      fi
+    done
+  fi
+  if test -n "$(nvsmi -L)" ; then
+    # cache the result of the gpu query
+    ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))')
+    echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
+    chmod a+r "/var/run/nvidia-gpu-index.txt"
+  fi
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
       systemctl start "hadoop-yarn-${svc}.service"
     fi
   done
   clear_nvsmi_cache
+  # Start persistenced if it's not already running
+  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
   mark_complete enable-mig
 }
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 565ec3ba0..eea7b3dd5 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -200,10 +200,11 @@ function prepare_gpu_env(){
     readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
   fi
 
-  # Verify SPARK compatability
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
+  # Set variables from metadata
+  RAPIDS_RUNTIME="$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")"
   INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")"
-  readonly RAPIDS_RUNTIME INCLUDE_GPUS
+  INCLUDE_PYTORCH="$(get_metadata_attribute 'include-pytorch' 'no')"
+  readonly RAPIDS_RUNTIME INCLUDE_GPUS INCLUDE_PYTORCH
 
   # determine whether we have nvidia-smi installed and working
   nvsmi
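
Notes (illustrative sketches, not part of the diff):

The NCCL and MIG hunks lean on version comparison helpers such as version_le, version_ge, and version_gt that are defined elsewhere in templates/gpu/util_functions and do not appear in this patch. A minimal sort -V based sketch of the comparison they are assumed to perform:

  function version_ge() { [[ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" == "$2" ]] ; }
  function version_le() { [[ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" == "$1" ]] ; }
  function version_gt() { ! version_le "$1" "$2" ; }
  function version_lt() { ! version_ge "$1" "$2" ; }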
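The new install_pytorch function is gated on cluster metadata: it only runs when include-pytorch resolves to 1/yes/true, and it installs into the conda environment named by gpu-conda-env (default dpgce). A hypothetical opt-in at cluster creation time, with cluster name, region, and accelerator type as placeholder values:

  gcloud dataproc clusters create example-gpu-cluster \
      --region us-central1 \
      --master-accelerator type=nvidia-tesla-a100,count=1 \
      --metadata include-pytorch=yes,gpu-conda-env=dpgce

On a cache miss the mamba solve and install can take a long time; the resulting environment is tarred up and copied to ${pkg_bucket}/conda/${_shortname}/ so that later runs unpack the cached tarball instead of re-solving.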
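enable_mig leaves an open question in a comment: whether lsof can identify the processes holding /dev/nvidia* open so the owning services can be stopped before the GPU reset. A hedged sketch of one way to answer it; lsof -t and fuser are standard tools, and mapping a PID back to its systemd unit with systemctl status is an assumption about how the result would be used:

  for pid in $(lsof -t /dev/nvidia* 2>/dev/null | sort -u) ; do
    # The first line of `systemctl status <pid>` names the unit the PID belongs to, if any
    unit="$(systemctl status "${pid}" 2>/dev/null | awk 'NR==1 {print $2}')"
    echo "pid ${pid} (unit: ${unit:-none}) holds an open NVIDIA device handle"
  done
  # fuser -v /dev/nvidia* reports similar information in a single call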
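The console log captured in the mig_functions hunk shows MIG mode remaining in a pending state on the A100 until the GPU is reset or the node is rebooted. The same nvidia-smi query used in that log can confirm the final state after enable_mig has run; "Enabled" is the expected value for the current mode on MIG-capable GPUs:

  /usr/bin/nvidia-smi --query-gpu=index,mig.mode.current,mig.mode.pending --format=csv,noheader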