feat: add optimum-tpu TGI v0.2.3 (#139)
* feat: add optimum-tpu TGI v0.2.3

The main feature is the addition of Llama 3.1, 3.2 and 3.3 (text-only)
models.

* fix: remove * when copying entrypoint

* review(TGI TPU): add a comment on why we install two python versions
tengomucho authored Jan 9, 2025
1 parent 1c31c51 commit 0232c42
Showing 3 changed files with 246 additions and 2 deletions.
4 changes: 2 additions & 2 deletions containers/tgi/README.md
@@ -132,8 +132,8 @@ The TGI containers come with two different variants depending on the accelerator
docker build -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311 -f containers/tgi/gpu/2.3.1/Dockerfile .
```

- **TPU**: You can build TGI container for Google Cloud TPUs on any machine with docker build, you do not need to build it on a TPU VM
- **TPU**: You can build the TGI container for Google Cloud TPUs on any machine with `docker build`; you do not need to build it on a TPU VM.

```bash
docker build --ulimit nofile=100000:100000 -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.2.py310 -f containers/tgi/tpu/0.2.2/Dockerfile .
docker build --ulimit nofile=100000:100000 -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.3.py310 -f containers/tgi/tpu/0.2.3/Dockerfile .
```
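
  For reference (not part of this diff), the built image can then be run on a TPU VM along the lines below; the model ID, token placeholder, and resource flags are assumptions for illustration rather than values taken from the repository:

  ```bash
  docker run --rm --privileged --shm-size 16G --net host \
    -e MODEL_ID=meta-llama/Llama-3.1-8B-Instruct \
    -e HF_TOKEN=<your-hf-token> \
    -e MAX_BATCH_SIZE=4 \
    us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.3.py310
  ```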
199 changes: 199 additions & 0 deletions containers/tgi/tpu/0.2.3/Dockerfile
@@ -0,0 +1,199 @@
# Enable GCP integration by default
ARG ENABLE_GOOGLE_FEATURE=1

# Fetch and extract the TGI sources
FROM alpine AS tgi
# TGI version 2.4.1 by default
ARG TGI_VERSION=v2.4.1
RUN test -n ${TGI_VERSION:?}
RUN mkdir -p /tgi
ADD https://github.com/huggingface/text-generation-inference/archive/${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
# Note that the build image is aligned on the same Linux version as the base image (Debian bookworm/ Ubuntu 22.04)
FROM lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder
ARG ENABLE_GOOGLE_FEATURE
RUN echo "Google Feature Status: ${ENABLE_GOOGLE_FEATURE}"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
python3.11-dev
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
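# Build the TGI binaries; the optional "google" cargo feature enables TGI's Google/Vertex AI-specific request handling.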
RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \
cargo build --profile release-opt --features google; \
else \
cargo build --profile release-opt; \
fi

# Python base image
FROM ubuntu:22.04 AS base

RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
python3-pip \
python3-setuptools \
python-is-python3 \
git \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
RUN pip3 --no-cache-dir install --upgrade pip

ARG ENABLE_GOOGLE_FEATURE
ARG VERSION='0.2.3'
RUN test -n ${VERSION:?}

FROM base AS optimum-tpu-installer

COPY . /tmp/src

RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \
# If we are building for GCP, we need to clone the optimum-tpu repo as this is built from the huggingface/Google-Cloud-Containers repository and not the huggingface/optimum-tpu repository
git clone https://github.com/huggingface/optimum-tpu.git /opt/optimum-tpu && \
cd /opt/optimum-tpu && git checkout v${VERSION}; \
fi && \
# Check if the optimum-tpu repo is cloned properly
cp -a /tmp/src /opt/optimum-tpu && \
if [ ! -d "/opt/optimum-tpu/optimum" ]; then \
echo "Error: Building from incorrect repository. This build must be run from optimum-tpu repo. If building from google-cloud-containers repo, set ENABLE_GOOGLE_FEATURE=1 to automatically clone optimum-tpu" && \
exit 1; \
fi


# Python server build image
FROM base AS pyserver

RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
make \
python3-venv \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

RUN install -d /pyserver
WORKDIR /pyserver
COPY --from=optimum-tpu-installer /opt/optimum-tpu/text-generation-inference/server server
COPY --from=tgi /tgi/proto proto
RUN pip3 install -r server/build-requirements.txt
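# gen-server generates the Python gRPC stubs from the proto definitions and packages the server
# (the dist/ tarball installed in the final stage).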
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto VERSION=${VERSION} make -C server gen-server

# TPU base image (used for deployment)
FROM base AS tpu_base

ARG VERSION=${VERSION}

# Install system prerequisites
# NOTE: we need both python3.10 and python3.11 installed, as the TGI router uses Python 3.11 while optimum-tpu uses
# Python 3.10. This has been fixed in a newer version of optimum-tpu, and the workaround will be removed in the next
# release (see https://github.com/huggingface/optimum-tpu/pull/135 for details).
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
libpython3.10 \
libpython3.11 \
python3.11 \
git \
gnupg2 \
wget \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

# Update pip
RUN pip install --upgrade pip

# Install HuggingFace packages
ARG TRANSFORMERS_VERSION='4.46.3'
ARG ACCELERATE_VERSION='1.1.1'
ARG SAFETENSORS_VERSION='0.4.5'

ARG ENABLE_GOOGLE_FEATURE

ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV VERSION=${VERSION}

ENV PORT=${ENABLE_GOOGLE_FEATURE:+8080}
ENV PORT=${PORT:-80}

ENV HF_HOME=${ENABLE_GOOGLE_FEATURE:+/tmp}
ENV HF_HOME=${HF_HOME:-/data}
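# The :+ / :- expansions above default PORT to 8080 and HF_HOME to /tmp when ENABLE_GOOGLE_FEATURE is set,
# and to 80 and /data otherwise (the first ENV of each pair expands to the empty string when the arg is unset).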

# Install requirements for TGI, which uses Python 3.11
RUN python3.11 -m pip install transformers==${TRANSFORMERS_VERSION}

# Install requirements for optimum-tpu, then install optimum-tpu itself
RUN python3 -m pip install hf_transfer safetensors==${SAFETENSORS_VERSION} typer
COPY --from=optimum-tpu-installer /opt/optimum-tpu /opt/optimum-tpu
RUN python3 /opt/optimum-tpu/optimum/tpu/cli.py install-jetstream-pytorch --yes
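# The extra find-links index below points at Google's libtpu wheel releases required by torch-xla on TPU hosts.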
RUN python3 -m pip install -e /opt/optimum-tpu \
-f https://storage.googleapis.com/libtpu-releases/index.html

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
# Install python server
COPY --from=pyserver /pyserver/build/dist dist
RUN pip install dist/text_generation_server*.tar.gz


# TPU compatible image for Inference Endpoints
FROM tpu_base AS inference-endpoint

COPY text-generation-inference/docker/entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]

FROM tpu_base AS google-cloud-containers

# Install Google specific components if ENABLE_GOOGLE_FEATURE is set
RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
curl \
git && \
rm -rf /var/lib/apt/lists/* && \
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
| tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
| apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
apt-get update -y && \
apt-get install google-cloud-sdk -y; \
fi

# Custom entrypoint for Google
COPY --chmod=775 containers/tgi/tpu/${VERSION}/entrypoint.sh entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]
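
Not part of the diff, but worth noting: the Dockerfile ends in two deployment stages, `inference-endpoint` (for Hugging Face Inference Endpoints) and `google-cloud-containers` (the default, since it is the last stage). A stage can be selected explicitly with `--target`; for example, the README build command above is equivalent to:

```bash
docker build --ulimit nofile=100000:100000 \
  --target google-cloud-containers \
  -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.3.py310 \
  -f containers/tgi/tpu/0.2.3/Dockerfile .
```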
45 changes: 45 additions & 0 deletions containers/tgi/tpu/0.2.3/entrypoint.sh
@@ -0,0 +1,45 @@
#!/bin/bash

# This is required by GKE, see
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#privileged-mode
ulimit -l 68719476736

# Check if AIP_STORAGE_URI starts with "gs://"
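# (On Vertex AI, AIP_STORAGE_URI is injected by the platform and points at the model artifacts uploaded to GCS.)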
if [[ $AIP_STORAGE_URI == gs://* ]]; then
echo "AIP_STORAGE_URI set and starts with 'gs://', proceeding to download from GCS."
echo "AIP_STORAGE_URI: $AIP_STORAGE_URI"

# Define the target directory
TARGET_DIR="/tmp/model"
mkdir -p "$TARGET_DIR"

# Use gcloud storage to copy the content from GCS to the target directory
echo "Running: gcloud storage cp $AIP_STORAGE_URI/* $TARGET_DIR --recursive"
gcloud storage cp "$AIP_STORAGE_URI/*" "$TARGET_DIR" --recursive

# Check if the gcloud storage command was successful
if [ $? -eq 0 ]; then
echo "Model downloaded successfully to ${TARGET_DIR}."
# Update MODEL_ID to point to the local directory
echo "Updating MODEL_ID to point to the local directory."
export MODEL_ID="$TARGET_DIR"
else
echo "Failed to download model from GCS."
exit 1
fi
fi

if [[ -z "${MAX_BATCH_SIZE}" ]]; then
# Default to a batch size of 4 if no value is provided
export MAX_BATCH_SIZE="4"
fi

if [[ -n "${QUANTIZATION}" ]]; then
# If quantization is set, we use jetstream_int8 (this is the only option supported by optimum-tpu at the moment)
QUANTIZATION="jetstream_int8"
export QUANTIZATION="${QUANTIZATION}"
fi

ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'

exec text-generation-launcher "$@"
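
For completeness (also not part of the commit), the `gs://` branch above is the path Vertex AI exercises; run locally on a TPU VM whose service account can read the bucket, it would look roughly like this, with the bucket path being an assumption:

```bash
docker run --rm --privileged --net host \
  -e AIP_STORAGE_URI=gs://my-bucket/path/to/model \
  -e MAX_BATCH_SIZE=8 \
  us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.3.py310
```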
