From 7cfd16fc86b47f35c5235892d67453bf24d817d3 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:22:06 +0200 Subject: [PATCH 1/6] Add `examples/gke/tgi-multi-gpu-deployment/deployment.yaml` --- .../tgi-multi-gpu-deployment/deployment.yaml | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 examples/gke/tgi-multi-gpu-deployment/deployment.yaml diff --git a/examples/gke/tgi-multi-gpu-deployment/deployment.yaml b/examples/gke/tgi-multi-gpu-deployment/deployment.yaml new file mode 100644 index 00000000..8357fe9f --- /dev/null +++ b/examples/gke/tgi-multi-gpu-deployment/deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tgi-deployment +spec: + replicas: 1 + selector: + matchLabels: + app: tgi-server + template: + metadata: + labels: + app: tgi-server + hf.co/model: Meta-Llama-3-8B-Instruct + hf.co/task: text-generation + annotations: + deployment.kubernetes.io/max-replicas: "1" + spec: + containers: + - name: tgi-container + image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-0.ubuntu2204:latest + resources: + requests: + nvidia.com/gpu: 4 + env: + - name: MODEL_ID + value: google/gemma-7b-it + - name: REVISION + value: "8adab6a35fdbcdae0ae41ab1f711b1bc8d05727e" + - name: NUM_SHARD + value: "4" + - name: MAX_BATCH_PREFILL_TOKENS + value: "4096" + - name: MAX_INPUT_TOKENS + value: "4000" + - name: MAX_TOTAL_TOKENS + value: "4096" + - name: PORT + value: "8080" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_token + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 +--- +apiVersion: v1 +kind: Service +metadata: + name: tgi-service +spec: + selector: + app: tgi-server + type: ClusterIP + ports: + - protocol: TCP + port: 8080 + targetPort: 8080 + From a95695a707a1fb114616298d2ae0c601cc24cc29 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Mon, 15 Jul 2024 15:35:58 +0200 Subject: [PATCH 2/6] Update `metadata` and increase `ephemeral-storage` --- examples/gke/tgi-multi-gpu-deployment/deployment.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/gke/tgi-multi-gpu-deployment/deployment.yaml b/examples/gke/tgi-multi-gpu-deployment/deployment.yaml index 8357fe9f..9a28a3ec 100644 --- a/examples/gke/tgi-multi-gpu-deployment/deployment.yaml +++ b/examples/gke/tgi-multi-gpu-deployment/deployment.yaml @@ -11,7 +11,7 @@ spec: metadata: labels: app: tgi-server - hf.co/model: Meta-Llama-3-8B-Instruct + hf.co/model: google--gemma-7b-it hf.co/task: text-generation annotations: deployment.kubernetes.io/max-replicas: "1" @@ -22,6 +22,7 @@ spec: resources: requests: nvidia.com/gpu: 4 + ephemeral-storage: "24Gi" env: - name: MODEL_ID value: google/gemma-7b-it From 53d90272a29fedf70025e3db924ea740c397ee3f Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Mon, 29 Jul 2024 09:17:15 +0200 Subject: [PATCH 3/6] Update `deployment.yaml` to use `google/gemma-2-27b-it` instead --- .../gke/tgi-multi-gpu-deployment/deployment.yaml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/examples/gke/tgi-multi-gpu-deployment/deployment.yaml b/examples/gke/tgi-multi-gpu-deployment/deployment.yaml index 9a28a3ec..faf9c2f9 100644 --- a/examples/gke/tgi-multi-gpu-deployment/deployment.yaml +++ b/examples/gke/tgi-multi-gpu-deployment/deployment.yaml @@ -11,23 +11,21 @@ spec: metadata: labels: app: tgi-server - hf.co/model: google--gemma-7b-it + hf.co/model: google--gemma-2-27b-it hf.co/task: text-generation - annotations: - deployment.kubernetes.io/max-replicas: "1" spec: containers: - name: tgi-container - image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-0.ubuntu2204:latest + image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204:latest resources: requests: nvidia.com/gpu: 4 - ephemeral-storage: "24Gi" + ephemeral-storage: "64Gi" env: - name: MODEL_ID - value: google/gemma-7b-it + value: google/gemma-2-27b-it - name: REVISION - value: "8adab6a35fdbcdae0ae41ab1f711b1bc8d05727e" + value: "2d74922e8a2961565b71fd5373081e9ecbf99c08" - name: NUM_SHARD value: "4" - name: MAX_BATCH_PREFILL_TOKENS @@ -66,4 +64,3 @@ spec: - protocol: TCP port: 8080 targetPort: 8080 - From 4a137e23ba41bfebed1aa120bdb93dd47ddfe60d Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Wed, 21 Aug 2024 08:34:15 +0200 Subject: [PATCH 4/6] Move `deployment.yaml` into `config/*.yaml` --- .../{ => config}/deployment.yaml | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) rename examples/gke/tgi-multi-gpu-deployment/{ => config}/deployment.yaml (78%) diff --git a/examples/gke/tgi-multi-gpu-deployment/deployment.yaml b/examples/gke/tgi-multi-gpu-deployment/config/deployment.yaml similarity index 78% rename from examples/gke/tgi-multi-gpu-deployment/deployment.yaml rename to examples/gke/tgi-multi-gpu-deployment/config/deployment.yaml index faf9c2f9..7b2dda84 100644 --- a/examples/gke/tgi-multi-gpu-deployment/deployment.yaml +++ b/examples/gke/tgi-multi-gpu-deployment/config/deployment.yaml @@ -11,21 +11,19 @@ spec: metadata: labels: app: tgi-server - hf.co/model: google--gemma-2-27b-it + hf.co/model: google--gemma-7b-it hf.co/task: text-generation spec: containers: - name: tgi-container - image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204:latest + image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310:latest resources: requests: nvidia.com/gpu: 4 ephemeral-storage: "64Gi" env: - name: MODEL_ID - value: google/gemma-2-27b-it - - name: REVISION - value: "2d74922e8a2961565b71fd5373081e9ecbf99c08" + value: "google/gemma-7b-it" - name: NUM_SHARD value: "4" - name: MAX_BATCH_PREFILL_TOKENS @@ -51,16 +49,3 @@ spec: sizeLimit: 1Gi nodeSelector: cloud.google.com/gke-accelerator: nvidia-l4 ---- -apiVersion: v1 -kind: Service -metadata: - name: tgi-service -spec: - selector: - app: tgi-server - type: ClusterIP - ports: - - protocol: TCP - port: 8080 - targetPort: 8080 From 706094f11474669130e59e8856a2b7a86edd0fba Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Wed, 21 Aug 2024 08:34:29 +0200 Subject: [PATCH 5/6] Add both `ingress.yaml` and `service.yaml` --- .../config/ingress.yaml | 17 +++++++++++++++++ .../config/service.yaml | 12 ++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 examples/gke/tgi-multi-gpu-deployment/config/ingress.yaml create mode 100644 examples/gke/tgi-multi-gpu-deployment/config/service.yaml diff --git a/examples/gke/tgi-multi-gpu-deployment/config/ingress.yaml b/examples/gke/tgi-multi-gpu-deployment/config/ingress.yaml new file mode 100644 index 00000000..2aac93a7 --- /dev/null +++ b/examples/gke/tgi-multi-gpu-deployment/config/ingress.yaml @@ -0,0 +1,17 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: tgi-ingress + annotations: + kubernetes.io/ingress.class: "gce" +spec: + rules: + - http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: tgi-service + port: + number: 8080 diff --git a/examples/gke/tgi-multi-gpu-deployment/config/service.yaml b/examples/gke/tgi-multi-gpu-deployment/config/service.yaml new file mode 100644 index 00000000..2e3978ac --- /dev/null +++ b/examples/gke/tgi-multi-gpu-deployment/config/service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: tgi-service +spec: + selector: + app: tgi-server + type: ClusterIP + ports: + - protocol: TCP + port: 8080 + targetPort: 8080 From 448b3f3248f90d5141956590b29f532a20fafd21 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Mon, 26 Aug 2024 09:02:58 +0200 Subject: [PATCH 6/6] Temporary patch to be able to run Gemma2 --- examples/gke/tgi-multi-gpu-deployment/config/deployment.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/gke/tgi-multi-gpu-deployment/config/deployment.yaml b/examples/gke/tgi-multi-gpu-deployment/config/deployment.yaml index 7b2dda84..d7340b31 100644 --- a/examples/gke/tgi-multi-gpu-deployment/config/deployment.yaml +++ b/examples/gke/tgi-multi-gpu-deployment/config/deployment.yaml @@ -16,7 +16,8 @@ spec: spec: containers: - name: tgi-container - image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310:latest + image: ghcr.io/huggingface/text-generation-inference:sha-f852190 + # image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310:latest resources: requests: nvidia.com/gpu: 4