Commit

feat: release a new preset version that supports vllm
Signed-off-by: jerryzhuang <[email protected]>
zhuangqh committed Nov 4, 2024
1 parent 5f2f649 commit 2711ee9
Showing 23 changed files with 81 additions and 71 deletions.
4 changes: 2 additions & 2 deletions pkg/inference/preset-inferences.go
@@ -22,9 +22,9 @@ import (
 )
 
 const (
-	ProbePath     = "/healthz"
+	ProbePath     = "/health"
 	Port5000      = int32(5000)
-	InferenceFile = "inference_api.py"
+	InferenceFile = "/workspace/tfs/inference_api.py"
 )
 
 var (
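The renamed ProbePath ("/health") and Port5000 constants above feed the liveness/readiness probes of the generated workloads. As a minimal sketch (not the literal KAITO code; the helper name and the use of k8s.io/api/core/v1 types are assumptions for illustration):

package inference

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// buildLivenessProbe is a hypothetical helper showing how ProbePath and
// Port5000 would be wired into a container probe after this change.
func buildLivenessProbe() *corev1.Probe {
	return &corev1.Probe{
		ProbeHandler: corev1.ProbeHandler{
			HTTPGet: &corev1.HTTPGetAction{
				Path: ProbePath,                     // "/health"
				Port: intstr.FromInt(int(Port5000)), // 5000
			},
		},
		InitialDelaySeconds: 600, // matches the manifests below (10 min warm-up)
		PeriodSeconds:       10,
	}
}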
6 changes: 3 additions & 3 deletions pkg/inference/preset-inferences_test.go
@@ -46,7 +46,7 @@ func TestCreatePresetInference(t *testing.T) {
 			workload: "Deployment",
 			// No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams
 			// So expected cmd consists of shell command and inference file
-			expectedCmd: "/bin/sh -c inference_api.py",
+			expectedCmd: "/bin/sh -c python3 /workspace/tfs/inference_api.py",
 			hasAdapters: false,
 		},
 
@@ -58,7 +58,7 @@ func TestCreatePresetInference(t *testing.T) {
 				c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil)
 			},
 			workload:    "StatefulSet",
-			expectedCmd: "/bin/sh -c inference_api.py",
+			expectedCmd: "/bin/sh -c python3 /workspace/tfs/inference_api.py",
 			hasAdapters: false,
 		},
 
@@ -69,7 +69,7 @@ func TestCreatePresetInference(t *testing.T) {
 				c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil)
 			},
 			workload:       "Deployment",
-			expectedCmd:    "/bin/sh -c inference_api.py",
+			expectedCmd:    "/bin/sh -c python3 /workspace/tfs/inference_api.py",
 			hasAdapters:    true,
 			expectedVolume: "adapter-volume",
 		},
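The expectedCmd updates above reflect two changes: the test model now sets BaseCommand ("python3", see testModel.go below), and InferenceFile is an absolute path. A rough sketch of the command shape the tests assert (assembleCommand is a hypothetical name, for illustration only):

package inference

import "strings"

// assembleCommand shows the asserted shape: a shell wrapper around
// "<BaseCommand> <InferenceFile>". Per the test comments above, the real
// builder also appends torchrun and model parameters when they are set.
func assembleCommand(baseCommand string) []string {
	inner := strings.TrimSpace(baseCommand + " " + InferenceFile)
	return []string{"/bin/sh", "-c", inner}
}

// With baseCommand = "python3" this yields:
//   ["/bin/sh", "-c", "python3 /workspace/tfs/inference_api.py"]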
2 changes: 2 additions & 0 deletions pkg/utils/test/testModel.go
@@ -16,6 +16,7 @@ func (*testModel) GetInferenceParameters() *model.PresetParam {
 	return &model.PresetParam{
 		GPUCountRequirement: "1",
 		ReadinessTimeout:    time.Duration(30) * time.Minute,
+		BaseCommand:         "python3",
 	}
 }
 func (*testModel) GetTuningParameters() *model.PresetParam {
@@ -37,6 +38,7 @@ func (*testDistributedModel) GetInferenceParameters() *model.PresetParam {
 	return &model.PresetParam{
 		GPUCountRequirement: "1",
 		ReadinessTimeout:    time.Duration(30) * time.Minute,
+		BaseCommand:         "python3",
 	}
 }
 func (*testDistributedModel) GetTuningParameters() *model.PresetParam {
30 changes: 19 additions & 11 deletions presets/models/supported_models.yaml
@@ -34,13 +34,14 @@ models:
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36
     runtime: tfs
-    tag: 0.0.6
+    tag: 0.0.7
   - name: falcon-7b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99
     runtime: tfs
-    tag: 0.0.6
+    tag: 0.0.7
     # Tag history:
+    # 0.0.7 - Support vllm runtime
     # 0.0.6 - Add Logging & Metrics Server
     # 0.0.5 - Tuning and Adapters
     # 0.0.4 - Adjust default model params (#310)
@@ -51,13 +52,14 @@ models:
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
   - name: falcon-40b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
     # Tag history for 40b models:
+    # 0.0.8 - Support vllm runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Tuning and Adapters
     # 0.0.5 - Adjust default model params (#310)
@@ -71,13 +73,14 @@ models:
     type: text-generation
     version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/c882233d224d27b727b3d9299b12a9aab9dda6f7
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
   - name: mistral-7b-instruct
     type: text-generation
     version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/0417f4babd26db0b5ed07c1d0bc85658ab526ea3
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
     # Tag history:
+    # 0.0.8 - Support vllm runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Update model version and Address missing weights files fix
     # 0.0.5 - Tuning and Adapters
@@ -91,8 +94,9 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670
     runtime: tfs
-    tag: 0.0.5
+    tag: 0.0.6
     # Tag history:
+    # 0.0.6 - Support vllm runtime
     # 0.0.5 - Add Logging & Metrics Server
     # 0.0.4 - Tuning and Adapters
     # 0.0.3 - Adjust default model params (#310)
@@ -104,34 +108,38 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/d269012bea6fbe38ce7752c8940fea010eea3383
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support vllm runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
 
   - name: phi-3-mini-128k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/5be6479b4bc06a081e8f4c6ece294241ccd32dec
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support vllm runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
 
   - name: phi-3-medium-4k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/d194e4e74ffad5a5e193e26af25bcfc80c7f1ffc
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support vllm runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
 
   - name: phi-3-medium-128k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/cae1d42b5577398fd1be9f0746052562ae552886
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support vllm runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
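For context, these tag bumps are what image-resolution tooling would pick up from this file. A minimal, self-contained sketch of reading it (the struct names and the registry placeholder are assumptions, not part of this commit):

package main

import (
	"fmt"
	"os"

	"gopkg.in/yaml.v3"
)

// Keys mirror the entries in supported_models.yaml; the types themselves
// are hypothetical and only for this sketch.
type modelEntry struct {
	Name    string `yaml:"name"`
	Type    string `yaml:"type"`
	Version string `yaml:"version"`
	Runtime string `yaml:"runtime"`
	Tag     string `yaml:"tag"`
}

type supportedModels struct {
	Models []modelEntry `yaml:"models"`
}

func main() {
	raw, err := os.ReadFile("presets/models/supported_models.yaml")
	if err != nil {
		panic(err)
	}
	var sm supportedModels
	if err := yaml.Unmarshal(raw, &sm); err != nil {
		panic(err)
	}
	for _, m := range sm.Models {
		// "<registry>" is a placeholder; the actual image repository is
		// not specified in this file.
		fmt.Printf("%s -> <registry>/%s:%s\n", m.Name, m.Name, m.Tag)
	}
}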
@@ -19,21 +19,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 4 # Requesting 4 GPUs
           limits:
             nvidia.com/gpu: 4
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
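Because every probe path in these manifests moves from /healthz to /health, any external smoke test must target the new route. A minimal sketch, assuming the pod has been made reachable locally (for example via kubectl port-forward <pod> 5000:5000):

package main

import (
	"fmt"
	"net/http"
)

func main() {
	// /health on port 5000 is the probe route these manifests now use;
	// localhost assumes a kubectl port-forward is running.
	resp, err := http.Get("http://localhost:5000/health")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.StatusCode) // expect 200 once the server is ready
}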
6 changes: 3 additions & 3 deletions presets/test/manifests/falcon-40b/falcon-40b.yaml
@@ -19,21 +19,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 4 # Requesting 4 GPUs
           limits:
             nvidia.com/gpu: 4
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
@@ -30,21 +30,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 2
           limits:
             nvidia.com/gpu: 2 # Requesting 2 GPUs
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
@@ -18,21 +18,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 2
           limits:
             nvidia.com/gpu: 2 # Requesting 2 GPUs
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
@@ -29,7 +29,7 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 2
6 changes: 3 additions & 3 deletions presets/test/manifests/falcon-7b/falcon-7b.yaml
@@ -18,21 +18,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 2
           limits:
             nvidia.com/gpu: 2 # Requesting 2 GPUs
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
6 changes: 3 additions & 3 deletions presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml
@@ -35,21 +35,21 @@ spec:
         - |
           echo "MASTER_ADDR: $MASTER_ADDR"
           NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
-          cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py
+          cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 /workspace/tfs/inference_api.py
         resources:
           limits:
             nvidia.com/gpu: "1"
           requests:
             nvidia.com/gpu: "1"
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
6 changes: 3 additions & 3 deletions presets/test/manifests/llama-2-13b/llama-2-13b.yaml
@@ -35,21 +35,21 @@ spec:
         - |
           echo "MASTER_ADDR: $MASTER_ADDR"
           NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
-          cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 inference_api.py
+          cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 /workspace/tfs/inference_api.py
         resources:
           limits:
             nvidia.com/gpu: "1"
           requests:
             nvidia.com/gpu: "1"
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
6 changes: 3 additions & 3 deletions presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml
@@ -19,21 +19,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - cd /workspace/llama/llama-2 && torchrun inference_api.py
+        - cd /workspace/llama/llama-2 && torchrun /workspace/tfs/inference_api.py
         resources:
           limits:
             nvidia.com/gpu: "1"
           requests:
             nvidia.com/gpu: "1"
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
6 changes: 3 additions & 3 deletions presets/test/manifests/llama-2-7b/llama-2-7b.yaml
@@ -19,21 +19,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - cd /workspace/llama/llama-2 && torchrun inference_api.py
+        - cd /workspace/llama/llama-2 && torchrun /workspace/tfs/inference_api.py
         resources:
           limits:
             nvidia.com/gpu: "1"
           requests:
             nvidia.com/gpu: "1"
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10