feat: add optimum-tpu TGI v0.2.3 (#139)
* feat: add optimum-tpu TGI v0.2.3. The main feature is the addition of the Llama 3.1, 3.2 and 3.3 (text-only) models.
* fix: remove * when copying entrypoint
* review(TGI TPU): add a comment on why we install two python versions
1 parent 1c31c51 · commit 0232c42
3 changed files with 246 additions and 2 deletions.
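As a rough sketch, the image could be built along these lines; the Dockerfile path, image tag and target stage are assumptions inferred from the COPY of containers/tgi/tpu/${VERSION}/entrypoint.sh and the stage names in the diff, not something stated in the commit:

    docker build \
      -f containers/tgi/tpu/0.2.3/Dockerfile \
      --build-arg ENABLE_GOOGLE_FEATURE=1 \
      --build-arg TGI_VERSION=v2.4.1 \
      --build-arg VERSION=0.2.3 \
      --target google-cloud-containers \
      -t tgi-tpu-example:0.2.3 \
      .

The inference-endpoint stage can be targeted instead when the Google-specific entrypoint and gcloud tooling are not needed; that stage expects the optimum-tpu repository itself as the build context, since it copies text-generation-inference/docker/entrypoint.sh.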
@@ -0,0 +1,199 @@
# Enable GCP integration by default
ARG ENABLE_GOOGLE_FEATURE=1

# Fetch and extract the TGI sources
FROM alpine AS tgi
# TGI version 2.4.1 by default
ARG TGI_VERSION=v2.4.1
RUN test -n ${TGI_VERSION:?}
RUN mkdir -p /tgi
ADD https://github.com/huggingface/text-generation-inference/archive/${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
# Note that the build image is aligned on the same Linux version as the base image (Debian bookworm / Ubuntu 22.04)
FROM lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder
ARG ENABLE_GOOGLE_FEATURE
RUN echo "Google Feature Status: ${ENABLE_GOOGLE_FEATURE}"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        python3.11-dev
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \
        cargo build --profile release-opt --features google; \
    else \
        cargo build --profile release-opt; \
    fi

# Python base image
FROM ubuntu:22.04 AS base

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        python3-pip \
        python3-setuptools \
        python-is-python3 \
        git \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
RUN pip3 --no-cache-dir install --upgrade pip

ARG ENABLE_GOOGLE_FEATURE
ARG VERSION='0.2.3'
RUN test -n ${VERSION:?}

FROM base AS optimum-tpu-installer

COPY . /tmp/src

RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \
        # If we are building for GCP, we need to clone the optimum-tpu repo as this is built from the huggingface/Google-Cloud-Containers repository and not the huggingface/optimum-tpu repository
        git clone https://github.com/huggingface/optimum-tpu.git /opt/optimum-tpu && \
        cd /opt/optimum-tpu && git checkout v${VERSION}; \
    fi && \
    # Check if the optimum-tpu repo is cloned properly
    cp -a /tmp/src /opt/optimum-tpu && \
    if [ ! -d "/opt/optimum-tpu/optimum" ]; then \
        echo "Error: Building from incorrect repository. This build must be run from optimum-tpu repo. If building from google-cloud-containers repo, set ENABLE_GOOGLE_FEATURE=1 to automatically clone optimum-tpu" && \
        exit 1; \
    fi

# Python server build image
FROM base AS pyserver

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        make \
        python3-venv \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN install -d /pyserver
WORKDIR /pyserver
COPY --from=optimum-tpu-installer /opt/optimum-tpu/text-generation-inference/server server
COPY --from=tgi /tgi/proto proto
RUN pip3 install -r server/build-requirements.txt
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto VERSION=${VERSION} make -C server gen-server

# TPU base image (used for deployment)
FROM base AS tpu_base

ARG VERSION=${VERSION}

# Install system prerequisites
# NOTE: both python3.10 and python3.11 need to be installed, as the TGI router uses Python 3.11 while optimum-tpu uses
# Python 3.10. This has been fixed in the newest version of optimum-tpu and the workaround will be removed in the next
# version (see https://github.com/huggingface/optimum-tpu/pull/135 for details).
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        libpython3.10 \
        libpython3.11 \
        python3.11 \
        git \
        gnupg2 \
        wget \
        curl \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Update pip
RUN pip install --upgrade pip

# Install HuggingFace packages
ARG TRANSFORMERS_VERSION='4.46.3'
ARG ACCELERATE_VERSION='1.1.1'
ARG SAFETENSORS_VERSION='0.4.5'

ARG ENABLE_GOOGLE_FEATURE

ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV VERSION=${VERSION}

# When building for Google Cloud (ENABLE_GOOGLE_FEATURE set), listen on port 8080; otherwise default to port 80
ENV PORT=${ENABLE_GOOGLE_FEATURE:+8080}
ENV PORT=${PORT:-80}

# When building for Google Cloud, use /tmp as the Hugging Face cache directory; otherwise use /data
ENV HF_HOME=${ENABLE_GOOGLE_FEATURE:+/tmp}
ENV HF_HOME=${HF_HOME:-/data}

# Install requirements for TGI, which uses Python 3.11
RUN python3.11 -m pip install transformers==${TRANSFORMERS_VERSION}

# Install requirements for optimum-tpu, then jetstream-pytorch and optimum-tpu itself
RUN python3 -m pip install hf_transfer safetensors==${SAFETENSORS_VERSION} typer
COPY --from=optimum-tpu-installer /opt/optimum-tpu /opt/optimum-tpu
RUN python3 /opt/optimum-tpu/optimum/tpu/cli.py install-jetstream-pytorch --yes
RUN python3 -m pip install -e /opt/optimum-tpu \
    -f https://storage.googleapis.com/libtpu-releases/index.html

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
# Install python server
COPY --from=pyserver /pyserver/build/dist dist
RUN pip install dist/text_generation_server*.tar.gz

# TPU compatible image for Inference Endpoints
FROM tpu_base AS inference-endpoint

COPY text-generation-inference/docker/entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]

FROM tpu_base AS google-cloud-containers

# Install Google specific components if ENABLE_GOOGLE_FEATURE is set
RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \
        apt-get update && \
        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
            ca-certificates \
            curl \
            git && \
        rm -rf /var/lib/apt/lists/* && \
        echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
            | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
        curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
            | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
        apt-get update -y && \
        apt-get install google-cloud-sdk -y; \
    fi

# Custom entrypoint for Google
COPY --chmod=775 containers/tgi/tpu/${VERSION}/entrypoint.sh entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]
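The two-step ENV pattern used above for PORT and HF_HOME relies on the ${VAR:+word} / ${VAR:-word} parameter expansions, which Docker also applies to ENV values. A minimal bash illustration of the same chaining (not part of the image, shown only to clarify the mechanism):

    # ${VAR:+word} expands to word only when VAR is set and non-empty;
    # ${VAR:-word} supplies a fallback when VAR is unset or empty.
    ENABLE_GOOGLE_FEATURE=1
    PORT=${ENABLE_GOOGLE_FEATURE:+8080}   # PORT=8080 because ENABLE_GOOGLE_FEATURE is set
    PORT=${PORT:-80}                      # keeps 8080; would become 80 if the previous line left PORT empty
    echo "$PORT"                          # prints 8080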
@@ -0,0 +1,45 @@
#!/bin/bash

# This is required by GKE, see
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#privileged-mode
ulimit -l 68719476736

# Check if AIP_STORAGE_URI starts with "gs://"
if [[ $AIP_STORAGE_URI == gs://* ]]; then
    echo "AIP_STORAGE_URI set and starts with 'gs://', proceeding to download from GCS."
    echo "AIP_STORAGE_URI: $AIP_STORAGE_URI"

    # Define the target directory
    TARGET_DIR="/tmp/model"
    mkdir -p "$TARGET_DIR"

    # Use gcloud storage to copy the content from GCS to the target directory
    echo "Running: gcloud storage cp $AIP_STORAGE_URI/* $TARGET_DIR --recursive"
    gcloud storage cp "$AIP_STORAGE_URI/*" "$TARGET_DIR" --recursive

    # Check if the gcloud storage command was successful
    if [ $? -eq 0 ]; then
        echo "Model downloaded successfully to ${TARGET_DIR}."
        # Update MODEL_ID to point to the local directory
        echo "Updating MODEL_ID to point to the local directory."
        export MODEL_ID="$TARGET_DIR"
    else
        echo "Failed to download model from GCS."
        exit 1
    fi
fi

if [[ -z "${MAX_BATCH_SIZE}" ]]; then
    # Default to a batch size of 4 if no value is provided
    export MAX_BATCH_SIZE="4"
fi

if [[ -n "${QUANTIZATION}" ]]; then
    # If quantization is requested, use jetstream_int8 (the only option supported by optimum-tpu at the moment)
    QUANTIZATION="jetstream_int8"
    export QUANTIZATION="${QUANTIZATION}"
fi

ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'

exec text-generation-launcher "$@"
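A hypothetical invocation of the resulting image, shown only to illustrate the environment variables this entrypoint reads; the image tag, model id and extra flags are assumptions rather than part of the commit:

    docker run --rm --privileged --net=host \
      -e MODEL_ID=meta-llama/Llama-3.1-8B-Instruct \
      -e MAX_BATCH_SIZE=4 \
      -e QUANTIZATION=1 \
      -e HF_TOKEN=<token, for gated checkpoints> \
      tgi-tpu-example:0.2.3

When AIP_STORAGE_URI points to a gs:// location (as on Vertex AI), the script first downloads the weights with gcloud storage and overrides MODEL_ID with the local path before handing off to text-generation-launcher.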