
feat: Add vLLM deployment to Helm chart
Hugoch committed Oct 10, 2024
1 parent 9e52ea1 commit 15842e9
Showing 5 changed files with 125 additions and 1 deletion.
9 changes: 9 additions & 0 deletions README.md
@@ -36,6 +36,8 @@ It can be used to benchmark any text generation server that exposes an OpenAI-co
* [Dataset configuration](#dataset-configuration)
* [Prompt configuration](#prompt-configuration)
* [Decode options](#decode-options)
* [Deploy on Kubernetes](#deploy-on-kubernetes)
* [Deploy on Slurm](#deploy-on-slurm)
* [Development](#development)
* [Frequently Asked Questions](#frequently-asked-questions)
* [TODO](#todo)
@@ -241,6 +243,13 @@ $ make build
If your CUDA graph batch sizes are not evenly distributed, you may see a performance drop at some request rates: the
batch size may fall into a larger CUDA graph batch size, leading to a loss of compute due to excessive padding.
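For illustration only: if the TGI version you deploy exposes a flag to pin the CUDA graph batch sizes (recent versions document a `--cuda-graphs` option; check your version), you could pass an evenly spaced set through the chart's `tgi.extra_args`. A sketch under that assumption, not a tuned recommendation:

```yaml
# Hypothetical values override: evenly spaced CUDA graph batch sizes for TGI.
# Assumes the deployed TGI image supports the --cuda-graphs flag.
tgi:
  extra_args:
    - "--cuda-graphs"
    - "1,2,4,8,16,32,64,128"
```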

* **I get fewer generated tokens than expected in a benchmark.**

Inference servers use the `max_tokens` parameter to cap the number of tokens generated per request. If the model
emits an end-of-sequence token before reaching `max_tokens`, generation stops early.
There is currently no way to guarantee a fixed number of generated tokens without modifying the inference server,
so you may end up with `generated tokens < (successful requests) * max_tokens`.

## TODO

- [X] Customizable token count and variance
@@ -0,0 +1,3 @@
{{- if and (.Values.tgi.enabled) (.Values.vllm.enabled) }}
{{- fail "You cannot enable multiple inference engines at the same time. Please check your values" }}
{{- end }}
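For illustration, a values override that enables both engines trips this guard as soon as the chart is rendered:

```yaml
# Invalid example: with both engines enabled, `helm template` / `helm install`
# aborts with the failure message defined above.
tgi:
  enabled: true
vllm:
  enabled: true
```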
@@ -1,3 +1,4 @@
{{- if .Values.tgi.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
@@ -47,7 +48,9 @@ spec:
          args:
            - "--model-id"
            - "{{ .Values.model_id }}"
            {{- toYaml .Values.tgi.extra_args | nindent 12 }}
            {{- with .Values.tgi.extra_args }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
          ports:
            - name: http
              containerPort: 8080
@@ -81,3 +84,4 @@ spec:
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
{{- end }}
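The `with` wrapper is a small robustness fix: when `tgi.extra_args` is empty or unset, the block now renders nothing, whereas the previous bare `toYaml` call would inject a stray `null` entry into the args list. With the chart's default values, the guarded block renders roughly as:

```yaml
# Approximate `helm template` output for the container args with the default values.yaml
args:
  - "--model-id"
  - "meta-llama/Llama-3.1-8B-Instruct"
  - --max-concurrent-requests
  - "512"
```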
87 changes: 87 additions & 0 deletions extra/k8s/text-generation-inference-benchmark/templates/vllm.yaml
@@ -0,0 +1,87 @@
{{- if .Values.vllm.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "text-generation-inference-benchmark.fullname" . }}
  labels:
    app.kubernetes.io/component: text-generation-inference
    {{- include "text-generation-inference-benchmark.labels" . | nindent 4 }}
spec:
  replicas: {{ .Values.vllm.replicaCount }}
  selector:
    matchLabels:
      app.kubernetes.io/component: text-generation-inference
      {{- include "text-generation-inference-benchmark.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      {{- with .Values.vllm.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app.kubernetes.io/component: text-generation-inference
        {{- include "text-generation-inference-benchmark.labels" . | nindent 8 }}
        {{- with .Values.vllm.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.vllm.podSecurityContext | nindent 8 }}
      containers:
        - name: vllm
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default "latest" }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: {{ include "text-generation-inference-benchmark.fullname" . }}-hf-token
                  key: HF_TOKEN
          args:
            - "--model"
            - "{{ .Values.model_id }}"
            - "--port"
            - "8080"
            {{- with .Values.vllm.extra_args }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
          ports:
            - name: http
              containerPort: 8080
              protocol: TCP
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 3
          resources:
            {{- toYaml .Values.vllm.resources | nindent 12 }}
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
      terminationGracePeriodSeconds: 10
      volumes:
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
      {{- with .Values.vllm.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.vllm.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.vllm.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
{{- end }}
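To exercise this template, point the chart at vLLM instead of TGI in your values override. A minimal sketch (the `--max-model-len` argument is only an illustrative vLLM flag, not a requirement of the chart):

```yaml
# Example override: benchmark a vLLM deployment instead of TGI.
tgi:
  enabled: false
vllm:
  enabled: true
  extra_args:
    - "--max-model-len"  # illustrative vLLM flag
    - "8192"
```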
21 changes: 21 additions & 0 deletions extra/k8s/text-generation-inference-benchmark/values.yaml
@@ -6,8 +6,10 @@ fullnameOverride: ""

hf_token: ""
model_id: "meta-llama/Llama-3.1-8B-Instruct"
server: tgi

tgi:
  enabled: true
  extra_args:
    - "--max-concurrent-requests"
    - "512"
@@ -27,6 +29,25 @@ tolerations: [ ]
  tolerations: [ ]
  affinity: { }

vllm:
  enabled: false
  extra_args:
  image:
    repository: vllm/vllm-openai
    pullPolicy: IfNotPresent
    tag: "latest"
  replicaCount: 1
  resources:
    limits:
      "nvidia.com/gpu": "1"
  podAnnotations: { }
  podLabels: { }
  podSecurityContext: { }
  securityContext: { }
  nodeSelector: { }
  tolerations: [ ]
  affinity: { }

benchmark:
  extra_args:
    - "--max-vus"
