From c4eb92d7bdf7349b8194469f575021c1be8ecd04 Mon Sep 17 00:00:00 2001
From: Emma
Date: Fri, 14 Jun 2024 20:14:08 +0800
Subject: [PATCH] [HugeCTR]Add a new base for hugectr (#1098)

* Add ctr base image and install ctr components in merlin-hugectr
* Use same version for pytorch base image
* Correct torch python folder name for new version
* Add merlin and test script for ctr-base
* update test script for new ctr-base and merlin-hugectr
* Fix typo
* Remove some packages hugectr may not use
* Add back keras since SOK uses it
* Refactor dockerfiles
* Correct relative path
* Upgrade upstream image to 24.03
* Remove libboost which is not in triton container
* Add libhdf5-dev
* Add execution privilege for test scripts
* Remove unused test script
* correct for base version
---
 ci/container_hugectr.sh      |  20 +++
 ci/test_container.sh         |   9 +-
 docker/dockerfile.ctr        | 108 +++++++++++--
 docker/dockerfile.merlin.ctr | 301 +++++++++++++++++++++++++++++++++++
 4 files changed, 424 insertions(+), 14 deletions(-)
 create mode 100755 ci/container_hugectr.sh
 create mode 100644 docker/dockerfile.merlin.ctr

diff --git a/ci/container_hugectr.sh b/ci/container_hugectr.sh
new file mode 100755
index 000000000..dec62f458
--- /dev/null
+++ b/ci/container_hugectr.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+container=$1
+devices=$2
+
+echo "##############"
+echo "# Unit tests #"
+echo "##############"
+
+exit_code=0
+
+## Test HugeCTR and SOK: both suites always run; a failure in either makes the script exit 1
+if [ "$container" == "merlin-hugectr" ]; then
+  echo "Run unit tests for HugeCTR"
+  /hugectr/ci/test_unit.sh "$container" "$devices" || exit_code=1
+  echo "Run unit tests for merlin-sok"
+  /hugectr/ci/test_unit.sh "merlin-tensorflow" "$devices" || exit_code=1
+fi
+
+exit "$exit_code"
diff --git a/ci/test_container.sh b/ci/test_container.sh
index d41b59fe2..cc3665af7 100755
--- a/ci/test_container.sh
+++ b/ci/test_container.sh
@@ -17,6 +17,11 @@ if [ $container != 'merlin-ci-runner' ]; then
 fi
 
 ${ci_script_dir}container_software.sh $container $devices
-${ci_script_dir}container_integration.sh $container $devices $suppress_failures
-${ci_script_dir}container_unit.sh $container $devices
+
+if [ "$container" == 'merlin-hugectr' ]; then
+  ${ci_script_dir}container_hugectr.sh "$container" "$devices"
+elif [ "$container" != 'ctr-base' ]; then
+  ${ci_script_dir}container_integration.sh "$container" "$devices" "$suppress_failures"
+  ${ci_script_dir}container_unit.sh "$container" "$devices"
+fi
 
diff --git a/docker/dockerfile.ctr b/docker/dockerfile.ctr
index b67e766b9..9a4f1d60e 100644
--- a/docker/dockerfile.ctr
+++ b/docker/dockerfile.ctr
@@ -1,16 +1,15 @@
 # syntax=docker/dockerfile:1.2
-ARG MERLIN_VERSION=23.06
-ARG TRITON_VERSION=23.06
+ARG MERLIN_VERSION=24.06
+ARG TRITON_VERSION=24.03
 
-ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION}
+ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/ctr-base:${MERLIN_VERSION}
 
 FROM ${BASE_IMAGE} as base
 
 ARG HUGECTR_VER=main
 ARG HUGECTR_BACKEND_VER=main
 
-RUN pip install --no-cache-dir --upgrade notebook ipython
-RUN pip install --no-cache-dir mpi4py
+RUN pip install --no-cache-dir --upgrade notebook ipython mpi4py
 
 # Install CUDA-Aware hwloc
 ARG HWLOC_VER=2.4.1
@@ -45,22 +44,86 @@ ENV SHARP_COLL_NUM_COLL_GROUP_RESOURCE_ALLOC_THRESHOLD=0
 ENV SHARP_COLL_LOCK_ON_COMM_INIT=1
 ENV SHARP_COLL_LOG_LEVEL=3
 ENV HCOLL_ENABLE_MCAST=0
+ENV LD_LIBRARY_PATH=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \
+    SOK_COMPILE_UNIT_TEST=ON
 
 # link sub modules expected by hugectr cmake
 RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so
 RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so
 RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g')
 
-# Install HugeCTR
+# Optional dependency: Build and install protocol buffers and Hadoop/HDFS.
+ARG INSTALL_HDFS=false
+# Env for HDFS
+ENV HADOOP_HOME=/opt/hadoop
+ENV PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin \
+    HDFS_NAMENODE_USER=root \
+    HDFS_SECONDARYNAMENODE_USER=root \
+    HDFS_DATANODE_USER=root \
+    YARN_RESOURCEMANAGER_USER=root \
+    YARN_NODEMANAGER_USER=root \
+    # Tackles ThreadReaper stack overflow issues: https://bugs.openjdk.java.net/browse/JDK-8153057
+    LIBHDFS_OPTS='-Djdk.lang.processReaperUseDefaultStackSize=true' \
+    # Tackles the JVM setting error signals that the UCX library checks (GitLab issue #425).
+    UCX_ERROR_SIGNALS='' \
+    CLASSPATH=${CLASSPATH}:\
+${HADOOP_HOME}/etc/hadoop/*:\
+${HADOOP_HOME}/share/hadoop/common/*:\
+${HADOOP_HOME}/share/hadoop/common/lib/*:\
+${HADOOP_HOME}/share/hadoop/hdfs/*:\
+${HADOOP_HOME}/share/hadoop/hdfs/lib/*:\
+${HADOOP_HOME}/share/hadoop/mapreduce/*:\
+${HADOOP_HOME}/share/hadoop/yarn/*:\
+${HADOOP_HOME}/share/hadoop/yarn/lib/*
+
+# Install Inference and HPS Backend
+ARG HUGECTR_DEV_MODE=false
+ARG HUGECTR_VER=main
+ARG _HUGECTR_REPO="github.com/NVIDIA-Merlin/HugeCTR.git"
+ARG HUGECTR_BACKEND_VER=main
+ARG _CI_JOB_TOKEN=""
+ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git"
 ARG HUGECTR_HOME=/usr/local/hugectr
-RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
-        rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
-        rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
+ARG TRITON_VERSION
+
+ENV PATH=$PATH:${HUGECTR_HOME}/bin \
+    CPATH=$CPATH:${HUGECTR_HOME}/include \
+    LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib
+
+RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \
+        # Install HugeCTR inference, which is a dependency of hps_backend
         git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \
         cd /hugectr && \
         git submodule update --init --recursive && \
         mkdir build && \
         cd build && \
+        if [[ "${INSTALL_HDFS}" == "false" ]]; then \
+            cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON .. \
+        ; else \
+            cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON -DENABLE_HDFS=ON .. \
+        ; fi && \
+        make -j$(nproc) && \
+        make install && \
+        rm -rf ./* && \
+        # Install hps_backend
+        git clone --branch ${HUGECTR_BACKEND_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_BACKEND_REPO} /repos/hugectr_triton_backend && \
+        mkdir /repos/hugectr_triton_backend/hps_backend/build && \
+        cd /repos/hugectr_triton_backend/hps_backend/build && \
+        cmake \
+            -DCMAKE_INSTALL_PREFIX:PATH=${HUGECTR_HOME} \
+            -DTRITON_COMMON_REPO_TAG="r${TRITON_VERSION}" \
+            -DTRITON_CORE_REPO_TAG="r${TRITON_VERSION}" \
+            -DTRITON_BACKEND_REPO_TAG="r${TRITON_VERSION}" .. && \
+        make -j$(nproc) && \
+        make install && \
+        chmod +x ${HUGECTR_HOME}/lib/*.so ${HUGECTR_HOME}/backends/hps/*.so && \
+        cd ../../.. && \
+        rm -rf hugectr_triton_backend && \
+        # Remove the incompatible gmock and gtest installed by hps_backend
+        rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
+        rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
+        # Install HugeCTR multinode (reuses the build directory emptied after the inference build)
+        cd /hugectr/build && \
         LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH && \
         export PATH=$PATH:/usr/local/cuda-$(echo $CUDA_VERSION | awk -F'.' '{print $1"."$2}')/compat && \
         if [[ "${INSTALL_HDFS}" == "false" ]]; then \
@@ -70,13 +133,34 @@ RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
         ; fi && \
         make -j$(nproc) && \
         make install && \
-        rm -rf ./* && \
         chmod +x ${HUGECTR_HOME}/bin/* ${HUGECTR_HOME}/lib/*.so && \
-        cd ../onnx_converter && \
+        # Install HPS trt plugin
+        cd ../hps_trt && \
+        mkdir build && \
+        cd build && \
+        cmake -DSM="70;75;80;90" .. && \
+        make -j$(nproc) && \
+        make install && \
+        cd ../../onnx_converter && \
         python setup.py install && \
-        mv /hugectr/ci ~/hugectr-ci && rm -rf /hugectr && mkdir -p /hugectr && mv ~/hugectr-ci /hugectr/ci \
+        pip --no-cache-dir install ninja tf2onnx && \
+        # Install SOK
+        cd ../sparse_operation_kit && \
+        python setup.py install && \
+        # Install HPS TF plugin
+        cd ../hps_tf && \
+        python setup.py install && \
+        # Install hps_torch
+        cd ../hps_torch/ && \
+        TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 9.0" python setup.py install && \
+        mv /hugectr/ci ~/hugectr-ci && mv /hugectr/sparse_operation_kit/sparse_operation_kit ~/hugectr-sparse_operation_kit && \
+        rm -rf /hugectr && mkdir -p /hugectr /hugectr/sparse_operation_kit && \
+        mv ~/hugectr-ci /hugectr/ci && mv ~/hugectr-sparse_operation_kit /hugectr/sparse_operation_kit/sparse_operation_kit && \
+        chmod +x /hugectr/ci/* /hugectr/sparse_operation_kit/sparse_operation_kit/* \
     ; fi
 
+RUN ln -s ${HUGECTR_HOME}/backends/hps /opt/tritonserver/backends/hps
+
 ENV PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib
 
 # Clean up
diff --git a/docker/dockerfile.merlin.ctr b/docker/dockerfile.merlin.ctr
new file mode 100644
index 000000000..29d5d5fe7
--- /dev/null
+++ b/docker/dockerfile.merlin.ctr
@@ -0,0 +1,301 @@
+# syntax=docker/dockerfile:1.2
+ARG TRITON_VERSION=24.03
+ARG DLFW_VERSION=${TRITON_VERSION}
+ARG TORCH_VERSION=${TRITON_VERSION}
+
+ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3
+ARG SDK_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3-sdk
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3-min
+ARG DLFW_IMAGE=nvcr.io/nvidia/tensorflow:${DLFW_VERSION}-tf2-py3
+ARG TORCH_IMAGE=nvcr.io/nvidia/pytorch:${TORCH_VERSION}-py3
+
+FROM ${FULL_IMAGE} as triton
+FROM ${SDK_IMAGE} as sdk
+FROM ${DLFW_IMAGE} as dlfw
+FROM ${TORCH_IMAGE} as torch
+FROM ${BASE_IMAGE} as build
+
+# Args
+ARG TARGETOS
+ARG TARGETARCH
+
+# Envs
+ENV CUDA_HOME=/usr/local/cuda
+ENV CUDA_PATH=$CUDA_HOME
+ENV CUDA_CUDA_LIBRARY=${CUDA_HOME}/lib64/stubs
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib
+ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
+
+# Set up NVIDIA package repository
+RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64") && \
+    apt clean && \
+    curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb && \
+    apt update -y --fix-missing && \
+    apt install -y --no-install-recommends \
+        autoconf \
+        automake \
+        build-essential \
+        ca-certificates \
+        clang-format \
+        curl \
+        datacenter-gpu-manager \
+        git \
+        libarchive-dev \
+        libb64-dev \
+        libboost-dev \
+        libcurl4-openssl-dev \
+        libexpat1-dev \
+        libopenblas-dev \
+        libre2-dev \
+        libsasl2-2 \
+        libssl-dev \
+        libtbb-dev \
+        openssl \
+        pkg-config \
+        policykit-1 \
+        protobuf-compiler \
+        python3 \
+        python3-pip \
+        python3-dev \
+        swig \
+        rapidjson-dev \
+        nlohmann-json3-dev \
+        wget \
+        zlib1g-dev \
+        libhdf5-dev && \
+    apt autoremove -y && \
+    apt clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# Install multiple packages
+
+# cmake 3.25.0 broke find_package(CUDAToolkit), which breaks the FAISS build:
+# https://gitlab.kitware.com/cmake/cmake/-/issues/24119
+# A fix has already been merged but not yet released:
+# https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7859
+# 2023-10-06: onnxruntime is pinned to 1.15.1 because the latest version changed its API and is not compatible with hugectr
+RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake<3.25.0" ninja scikit-build pandas==1.5.2 \
+    nvidia-pyindex pybind11 pytest \
+    tensorflow-metadata \
+    "scikit-learn<1.2" \
+    tritonclient[all]
+RUN pip install --no-cache-dir protobuf==3.20.3 onnx onnxruntime==1.15.1 pycuda
+RUN pip install --no-cache-dir onnx_graphsurgeon --index-url https://pypi.ngc.nvidia.com
+
+# Triton Server
+WORKDIR /opt/tritonserver
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE .
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/TRITON_VERSION .
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/NVIDIA_Deep_Learning_Container_License.pdf .
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/bin bin/
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/python/
+# NOTE 2023-09: fil-backend is not available on ARM. Some docker versions flag an error if there is
+# not a single source file to copy. To avoid this, we also specify a small dummy file. (NOTE(review): no fil-backend COPY follows in this stage - this note looks stale.)
+COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/.
+
+ENV PATH=/opt/tritonserver/bin:${PATH}
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib
+
+# Clean up
+RUN rm -rf /repos
+
+HEALTHCHECK NONE
+CMD ["/bin/bash"]
+
+FROM ${BASE_IMAGE} as base
+
+# Args
+ARG TARGETOS
+ARG TARGETARCH
+
+# Envs
+ENV CUDA_HOME=/usr/local/cuda
+ENV CUDA_PATH=$CUDA_HOME
+ENV CUDA_CUDA_LIBRARY=${CUDA_HOME}/lib64/stubs
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib
+ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
+
+# Set up NVIDIA package repository
+RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64") && \
+    apt clean && \
+    curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb && \
+    apt update -y --fix-missing && \
+    apt install -y --no-install-recommends \
+        ca-certificates \
+        clang-format \
+        curl \
+        libcurl4-openssl-dev \
+        git \
+        graphviz \
+        libarchive-dev \
+        libb64-dev \
+        libboost-dev \
+        libexpat1-dev \
+        libopenblas-dev \
+        libre2-dev \
+        libsasl2-2 \
+        libssl-dev \
+        libtbb-dev \
+        openssl \
+        policykit-1 \
+        protobuf-compiler \
+        python3 \
+        python3-pip \
+        python3-dev \
+        rapidjson-dev \
+        tree \
+        wget \
+        zlib1g-dev \
+        libhdf5-dev \
+        # Required to build RocksDB and RdKafka.
+        libgflags-dev \
+        libbz2-dev \
+        libsnappy-dev \
+        liblz4-dev \
+        libzstd-dev \
+        libsasl2-dev \
+        # Required to build Protocol Buffers.
+        autoconf automake libtool \
+        # Required to build Hadoop.
+        pkg-config \
+        libpmem-dev \
+        libsnappy-dev \
+        # Required to run Hadoop.
+        openssh-server \
+        # [ HugeCTR ]
+        libaio-dev && \
+    apt autoremove -y && \
+    apt clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+ENV JAVA_HOME=/usr/lib/jvm/default-java
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${JAVA_HOME}/lib:${JAVA_HOME}/lib/server
+
+# Binaries
+COPY --chown=1000:1000 --from=build /usr/local/bin/cmake /usr/local/bin/
+COPY --chown=1000:1000 --from=build /usr/local/bin/pytest /usr/local/bin/
+COPY --chown=1000:1000 --from=sdk /usr/local/bin/perf_* /usr/local/bin/
+
+# Triton Server
+WORKDIR /opt/tritonserver
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE .
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/TRITON_VERSION .
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/NVIDIA_Deep_Learning_Container_License.pdf .
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/bin bin/
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/python/
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/tensorrt/
+COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/.
+COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.3 /tmp
+RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "aarch64" || echo "x86_64") && \
+    mv /tmp/libdcgm.so.3 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.3 && \
+    chmod 644 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.3 && \
+    ln -s libdcgm.so.3 /usr/lib/${ARCH}-linux-gnu/libdcgm.so
+
+ENV PATH=/opt/tritonserver/bin:${PATH}
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib
+
+# Must match the image's interpreter: python --version | sed -e 's/[A-Za-z ]*//g' | awk -F'.' '{print $1"."$2}'
+ENV PYTHON_VERSION=3.10
+
+# Python Packages
+COPY --chown=1000:1000 --from=build /usr/local/lib/python${PYTHON_VERSION}/dist-packages /usr/local/lib/python${PYTHON_VERSION}/dist-packages/
+ENV PYTHONPATH=$PYTHONPATH:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/
+
+
+# rapids components from the DLFW image
+COPY --chown=1000:1000 --from=dlfw /usr/lib/libcudf* /usr/lib/
+COPY --chown=1000:1000 --from=dlfw /usr/lib/libarrow* /usr/lib/
+COPY --chown=1000:1000 --from=dlfw /usr/lib/libparquet* /usr/lib/
+COPY --chown=1000:1000 --from=dlfw /usr/lib/cmake/Arrow /usr/lib/cmake/Arrow/
+COPY --chown=1000:1000 --from=dlfw /usr/lib/cmake/Parquet /usr/lib/cmake/Parquet/
+COPY --chown=1000:1000 --from=dlfw /usr/lib/libnvcomp* /usr/lib/
+
+COPY --chown=1000:1000 --from=dlfw /usr/include/fmt /usr/include/fmt/
+COPY --chown=1000:1000 --from=dlfw /usr/include/spdlog /usr/include/spdlog/
+COPY --chown=1000:1000 --from=dlfw /usr/include/rmm /usr/include/rmm/
+COPY --chown=1000:1000 --from=dlfw /usr/include/parquet /usr/include/parquet/
+COPY --chown=1000:1000 --from=dlfw /usr/include/arrow /usr/include/arrow/
+COPY --chown=1000:1000 --from=dlfw /usr/include/cudf /usr/include/cudf/
+
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cuda /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cuda
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupyx /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupyx
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_backends /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_backends
+
+
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf.dist-info/
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow.dist-info/
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm.dist-info/
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy.dist-info/
+
+# Triton TF backends
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorflow backends/tensorflow/
+
+# Tensorflow dependencies (only)
+# Pinning to pass hugectr sok tests
+# Need to install transformers after tensorflow has been pulled in, so it builds artifacts correctly.
+# transformers upgraded to 4.36.0 due to GHSA-3863-2447-669p
+# Torch Metrics (without torch: the pip-installed torch and caffe2 directories are removed again below)
+RUN pip install --no-cache-dir tensorflow==2.14.0 protobuf==3.20.3 wrapt==1.14.0 transformers==4.36.0 \
+    && pip uninstall tensorflow keras -y \
+    && pip install --no-cache-dir --no-deps torch torchmetrics \
+    && rm -rf /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch \
+    && rm -rf /usr/local/lib/python${PYTHON_VERSION}/dist-packages/caffe2
+
+# DLFW Tensorflow packages
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow/
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow.dist-info/
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/keras /usr/local/lib/python${PYTHON_VERSION}/dist-packages/keras/
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/keras-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/keras.dist-info/
+COPY --chown=1000:1000 --from=dlfw /usr/local/bin/saved_model_cli /usr/local/bin/saved_model_cli
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/tensorflow/ /usr/local/lib/tensorflow/
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/horovod /usr/local/lib/python${PYTHON_VERSION}/dist-packages/horovod/
+COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/horovod-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/horovod.dist-info/
+COPY --chown=1000:1000 --from=dlfw /usr/local/bin/horovodrun /usr/local/bin/horovodrun
+
+# Triton Torch backend
+COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/pytorch backends/pytorch
+
+# Symlink every file of the Triton pytorch backend into /usr/local/lib
+RUN ln -s /opt/tritonserver/backends/pytorch/* /usr/local/lib/
+
+# Python packages taken from the PyTorch image (torch, tensorrt)
+COPY --chown=1000:1000 --from=torch /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch
+COPY --chown=1000:1000 --from=torch /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch.dist-info/
+COPY --chown=1000:1000 --from=torch /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorrt /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorrt
+COPY --chown=1000:1000 --from=torch /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorrt-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorrt.dist-info/
+
+ENV LIGHTFM_NO_CFLAGS=1
+RUN pip install --no-cache-dir jupyterlab notebook pydot testbook lightfm
+
+ENV JUPYTER_CONFIG_DIR=/tmp/.jupyter
+ENV JUPYTER_DATA_DIR=/tmp/.jupyter
+ENV JUPYTER_RUNTIME_DIR=/tmp/.jupyter
+
+ARG MERLIN_VER=main
+ENV MERLIN_VER=${MERLIN_VER}
+
+# Add Merlin Repo (installed without its dependencies)
+RUN git clone --branch ${MERLIN_VER} --depth 1 https://github.com/NVIDIA-Merlin/Merlin/ /Merlin && \
+    cd /Merlin/ && pip install --no-cache-dir . --no-deps
+
+HEALTHCHECK NONE
+CMD ["/bin/bash"]
+ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]