Exercised older CUDA and MIG A100 use cases more; added PyTorch installation functionality
cjac committed Jan 16, 2025
1 parent 5a37d94 commit 7662215
Showing 3 changed files with 131 additions and 43 deletions.
112 changes: 78 additions & 34 deletions templates/gpu/install_functions
@@ -1,14 +1,15 @@
function set_cudnn_version() {
readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39"
readonly DEFAULT_CUDNN8_VERSION="8.3.1.22"
readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"

# Parameters for NVIDIA-provided cuDNN library
DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
readonly DEFAULT_CUDNN_VERSION
CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
# The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION}
if ( is_rocky && version_le "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then
CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
# cuDNN v8 is not distributed for ubuntu20+, debian12
CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
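The Rocky floor above leans on the template's version_le helper. A minimal sketch of sort -V based comparison, assuming this is roughly how version_le/version_ge are defined elsewhere in these templates:

# Sketch only (assumption about how the helpers are implemented elsewhere in the templates)
function version_ge() { [[ "$(printf '%s\n%s\n' "$1" "$2" | sort -V | head -n1)" == "$2" ]] ; }
function version_le() { version_ge "$2" "$1" ; }

# Example: on Rocky, a requested cudnn-version below the floor is raised to MIN_ROCKY8_CUDNN8_VERSION
version_le "8.0.4.30" "8.0.5.39" && echo "raise CUDNN_VERSION to 8.0.5.39"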
@@ -303,30 +304,6 @@ function install_nvidia_nccl() {

local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"

# https://github.com/NVIDIA/nccl/blob/master/README.md
# https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
# Fermi: SM_20, compute_30
# Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
# Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
# Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62

# The following architectures are supported by the open kernel driver
# Volta: SM_70,SM_72, compute_70,compute_72
# Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87

# The following architectures are supported by CUDA v11.8+
# Ada: SM_89, compute_89
# Hopper: SM_90,SM_90a compute_90,compute_90a
# Blackwell: SM_100, compute_100
NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
if version_gt "${CUDA_VERSION}" "11.6" ; then
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
if version_ge "${CUDA_VERSION}" "11.8" ; then
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
if version_ge "${CUDA_VERSION}" "12.0" ; then
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi

mkdir -p "${workdir}"
pushd "${workdir}"

@@ -347,6 +324,30 @@ function install_nvidia_nccl() {
local local_tarball="${workdir}/${build_tarball}"
local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"

# https://github.com/NVIDIA/nccl/blob/master/README.md
# https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
# Fermi: SM_20, compute_30
# Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
# Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
# Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62

# The following architectures are supported by the open kernel driver
# Volta: SM_70,SM_72, compute_70,compute_72
# Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87

# The following architectures are supported by CUDA v11.8+
# Ada: SM_89, compute_89
# Hopper: SM_90,SM_90a compute_90,compute_90a
# Blackwell: SM_100, compute_100
NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
if version_gt "${CUDA_VERSION}" "11.6" ; then
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
if version_ge "${CUDA_VERSION}" "11.8" ; then
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
if version_ge "${CUDA_VERSION}" "12.0" ; then
NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi

output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
if echo "${output}" | grep -q "${gcs_tarball}" ; then
# cache hit - unpack from cache
@@ -369,11 +370,12 @@ function install_nvidia_nccl()
export NVCC_GENCODE
execute_with_retries make -j$(nproc) pkg.redhat.build
fi
tar czvf "/${local_tarball}" "../${build_path}"
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
rm "${local_tarball}"
tar czvf "${local_tarball}" "../${build_path}"
make clean
popd
tar xzvf "${local_tarball}"
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
rm "${local_tarball}"
fi
gcloud storage cat "${gcs_tarball}" | tar xz
}
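For context, the NCCL build is wrapped in a GCS-backed cache: the function checks the package bucket for a prebuilt tarball, unpacks it on a hit, and otherwise builds the packages, archives them, and publishes the archive for the next cluster. A minimal sketch of that cache-or-build pattern, using the same gsutil/gcloud tooling as above; build_nccl_packages and the bucket path are illustrative placeholders:

# Sketch only; not the template's exact flow.
gcs_tarball="gs://example-pkg-bucket/nccl/${build_tarball}"   # hypothetical location
if gsutil ls "${gcs_tarball}" > /dev/null 2>&1 ; then
  # cache hit: stream the archive from GCS and unpack it in place
  gcloud storage cat "${gcs_tarball}" | tar xz
else
  # cache miss: build, archive, publish, then unpack the local copy
  build_nccl_packages            # stands in for the make pkg.*.build invocations above
  tar czf "${local_tarball}" "../${build_path}"
  gcloud storage cp "${local_tarball}" "${gcs_tarball}"
  tar xzf "${local_tarball}" && rm "${local_tarball}"
fi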
@@ -415,16 +417,16 @@ function install_nvidia_cudnn()
apt-get -y install nvidia-cudnn
else
if is_cudnn8 ; then
install_local_cudnn8_repo
add_repo_cuda

apt-get update -qq
# Ignore version requested and use the latest version in the package index
cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)"

execute_with_retries \
apt-get -y install --no-install-recommends \
"libcudnn8=${cudnn_pkg_version}" \
"libcudnn8-dev=${cudnn_pkg_version}"

uninstall_local_cudnn8_repo
sync
elif is_cudnn9 ; then
install_cuda_keyring_pkg
@@ -452,6 +454,48 @@ function install_nvidia_cudnn()
mark_complete cudnn
}

function install_pytorch() {
if test -f "${workdir}/complete/pytorch" ; then return ; fi
local env
env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
local mc3=/opt/conda/miniconda3
local envpath="${mc3}/envs/${env}"
# Set numa node to 0 for all GPUs
for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
local verb=create
if test -d "${envpath}" ; then verb=install ; fi

readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
case "${INCLUDE_PYTORCH^^}" in
"1" | "YES" | "TRUE" )
local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
local local_tarball="${workdir}/${build_tarball}"
local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"

output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
if echo "${output}" | grep -q "${gcs_tarball}" ; then
# cache hit - unpack from cache
echo "cache hit"
mkdir -p "${envpath}"
gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
else
cudart_spec="cuda-cudart"
if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
"${mc3}/bin/mamba" "${verb}" -n "${env}" \
-c conda-forge -c nvidia -c rapidsai \
numba pytorch tensorflow[and-cuda] rapids pyspark \
"cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
pushd "${envpath}"
tar czf "${local_tarball}" .
popd
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
fi
;;
* ) echo "skip pytorch install" ;;
esac
touch "${workdir}/complete/pytorch"
}

function add_nonfree_components() {
if is_src_nvidia ; then return; fi
if ge_debian12 ; then
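The new install_pytorch function is driven entirely by cluster metadata: include-pytorch toggles it, gpu-conda-env names the target conda environment, and the resulting environment is cached in the package bucket keyed by distro and CUDA version. Assuming these templates back a Dataproc GPU initialization action, enabling the environment at cluster creation would look roughly like this (cluster name, region, and init-action path are placeholders; the metadata keys match the ones read above):

# Hedged example: only the metadata keys are taken from the code above; the rest is illustrative.
gcloud dataproc clusters create example-gpu-cluster \
    --region=us-central1 \
    --metadata=include-pytorch=yes,gpu-conda-env=dpgce \
    --initialization-actions=gs://example-bucket/gpu/install_gpu_driver.sh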
55 changes: 49 additions & 6 deletions templates/gpu/mig_functions
@@ -65,38 +65,81 @@ function configure_mig_cgi()
function enable_mig() {
is_complete enable-mig && return

# Start persistenced if it's not already running
# if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
# All devices on the same numa node
for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
# Write an ascii zero to the numa node indicator
echo "0" | dd of="${f}" status=none
done

echo "Stopping services and kernel modules in preparation for enabling mig."
if ( ps auwx | grep -i nvidia\\-persistenced ) ; then killall -9 nvidia-persistenced ; fi

# nvidia-smi --query-compute-apps=pid --format=csv,noheader
for svc in resourcemanager nodemanager; do
if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
systemctl stop "hadoop-yarn-${svc}.service"
fi
done
# can lsof be used to determine what processes have a file with name =~ /nvidia/ under the /dev/ directory ?
# if so, stop the service which launches the process with the open filehandle

MIG_GPU_LIST="`nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n ""`"
NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"

# root@cluster-1718310842-m:/tmp# for m in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do sudo rmmod $m ; done
# rmmod: ERROR: Module nvidia_drm is not currently loaded
# rmmod: ERROR: Module nvidia_modeset is not currently loaded
# rmmod: ERROR: Module nvidia_uvm is not currently loaded
# rmmod: ERROR: Module nvidia is not currently loaded
# root@cluster-1718310842-m:/tmp# nvidia-smi -i 0 --gpu-reset
# Resetting GPU 00000000:00:04.0 is not supported.
# root@cluster-1718310842-m:/tmp# nvidia-smi -i 0 --multi-instance-gpu=1
# Warning: MIG mode is in pending enable state for GPU 00000000:00:04.0:Not Supported
# Reboot the system or try nvidia-smi --gpu-reset to make MIG mode effective on GPU 00000000:00:04.0
# All done.
# root@cluster-1718310842-m:/tmp# echo $?
# 0
# root@cluster-1718310842-m:/tmp# /usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader
# Disabled

if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then

time nvsmi --gpu-reset || { # 30s
echo "unable to reset gpu. Trying to stop services and kernel modules which may have a lock."
# TODO: find a way to reset the A100 without reboot
removed="1"
for tryno in {1..25} ; do ; removed="1"
for mod in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do
if lsmod | grep -q "${mod}" ; then rmmod $mod > /dev/null 2>&1 || removed="0" ; fi ; done
if [[ "${removed}" == "1" ]] ; then
echo "modules removed successfully"
nvsmi --gpu-reset
break
nvsmi --gpu-reset && break
fi
done
}
nvsmi -mig 1

if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do
if version_le "${CUDA_VERSION}" "11.6" ; then
nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
else
nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
fi
done
fi
if test -n "$(nvsmi -L)" ; then
# cache the result of the gpu query
ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
chmod a+r "/var/run/nvidia-gpu-index.txt"
fi
for svc in resourcemanager nodemanager; do
if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
systemctl start "hadoop-yarn-${svc}.service"
fi
done
clear_nvsmi_cache
# Start persistenced if it's not already running
if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi

mark_complete enable-mig
}
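Because MIG mode can remain in a pending state until the GPU resets cleanly or the node reboots (as the captured transcript above shows), it is worth confirming the final state after enable_mig completes. The same queries used in that transcript suffice:

# Expect "Enabled" for every MIG-capable device once the mode change has taken effect
nvidia-smi --query-gpu=index,mig.mode.current --format=csv,noheader
# List the GPUs and any MIG devices that have been instantiated
nvidia-smi -L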
7 changes: 4 additions & 3 deletions templates/gpu/util_functions
@@ -200,10 +200,11 @@ function prepare_gpu_env(){
readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
fi

# Verify SPARK compatability
RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
# Set variables from metadata
RAPIDS_RUNTIME="$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")"
INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")"
readonly RAPIDS_RUNTIME INCLUDE_GPUS
INCLUDE_PYTORCH="$(get_metadata_attribute 'include-pytorch' 'no')"
readonly RAPIDS_RUNTIME INCLUDE_GPUS INCLUDE_PYTORCH

# determine whether we have nvidia-smi installed and working
nvsmi
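prepare_gpu_env now reads include-pytorch alongside rapids-runtime and include-gpus. get_metadata_attribute is defined elsewhere in these templates; on a Dataproc/GCE VM it typically wraps the instance metadata server, roughly as sketched here (an assumption, not the template's actual definition):

function get_metadata_attribute() {
  local -r attribute_name="$1"
  local -r default_value="${2:-}"
  # Instance attributes are served by the GCE metadata server; fall back to the default when unset.
  curl -fsS -H 'Metadata-Flavor: Google' \
    "http://metadata.google.internal/computeMetadata/v1/instance/attributes/${attribute_name}" \
    || echo -n "${default_value}"
}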
