Commit

feat: release a new preset version that supports vllm
Signed-off-by: jerryzhuang <[email protected]>
zhuangqh committed Nov 4, 2024
1 parent 5f2f649 commit 2711ee9
Showing 23 changed files with 81 additions and 71 deletions.
4 changes: 2 additions & 2 deletions pkg/inference/preset-inferences.go
@@ -22,9 +22,9 @@ import (
 )
 
 const (
-	ProbePath     = "/healthz"
+	ProbePath     = "/health"
 	Port5000      = int32(5000)
-	InferenceFile = "inference_api.py"
+	InferenceFile = "/workspace/tfs/inference_api.py"
 )
 
 var (
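The renamed ProbePath ("/health") and Port5000 constants above feed the liveness/readiness probes of the generated workloads. As a minimal sketch (not the literal KAITO code; the helper name and the use of k8s.io/api/core/v1 types are assumptions for illustration):

package inference

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// buildLivenessProbe is a hypothetical helper showing how ProbePath and
// Port5000 would be wired into a container probe after this change.
func buildLivenessProbe() *corev1.Probe {
	return &corev1.Probe{
		ProbeHandler: corev1.ProbeHandler{
			HTTPGet: &corev1.HTTPGetAction{
				Path: ProbePath,                     // "/health"
				Port: intstr.FromInt(int(Port5000)), // 5000
			},
		},
		InitialDelaySeconds: 600, // matches the manifests below (10 min warm-up)
		PeriodSeconds:       10,
	}
}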
6 changes: 3 additions & 3 deletions pkg/inference/preset-inferences_test.go
@@ -46,7 +46,7 @@ func TestCreatePresetInference(t *testing.T) {
 			workload: "Deployment",
 			// No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams
 			// So expected cmd consists of shell command and inference file
-			expectedCmd: "/bin/sh -c inference_api.py",
+			expectedCmd: "/bin/sh -c python3 /workspace/tfs/inference_api.py",
 			hasAdapters: false,
 		},
 
@@ -58,7 +58,7 @@ func TestCreatePresetInference(t *testing.T) {
 				c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil)
 			},
 			workload:    "StatefulSet",
-			expectedCmd: "/bin/sh -c inference_api.py",
+			expectedCmd: "/bin/sh -c python3 /workspace/tfs/inference_api.py",
 			hasAdapters: false,
 		},
 
@@ -69,7 +69,7 @@ func TestCreatePresetInference(t *testing.T) {
 				c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil)
 			},
 			workload:       "Deployment",
-			expectedCmd:    "/bin/sh -c inference_api.py",
+			expectedCmd:    "/bin/sh -c python3 /workspace/tfs/inference_api.py",
 			hasAdapters:    true,
 			expectedVolume: "adapter-volume",
 		},
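The expectedCmd updates above reflect two changes: the test model now sets BaseCommand ("python3", see testModel.go below), and InferenceFile is an absolute path. A rough sketch of the command shape the tests assert (assembleCommand is a hypothetical name, for illustration only):

package inference

import "strings"

// assembleCommand shows the asserted shape: a shell wrapper around
// "<BaseCommand> <InferenceFile>". Per the test comments above, the real
// builder also appends torchrun and model parameters when they are set.
func assembleCommand(baseCommand string) []string {
	inner := strings.TrimSpace(baseCommand + " " + InferenceFile)
	return []string{"/bin/sh", "-c", inner}
}

// With baseCommand = "python3" this yields:
//   ["/bin/sh", "-c", "python3 /workspace/tfs/inference_api.py"]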
2 changes: 2 additions & 0 deletions pkg/utils/test/testModel.go
@@ -16,6 +16,7 @@ func (*testModel) GetInferenceParameters() *model.PresetParam {
 	return &model.PresetParam{
 		GPUCountRequirement: "1",
 		ReadinessTimeout:    time.Duration(30) * time.Minute,
+		BaseCommand:         "python3",
 	}
 }
 func (*testModel) GetTuningParameters() *model.PresetParam {
@@ -37,6 +38,7 @@ func (*testDistributedModel) GetInferenceParameters() *model.PresetParam {
 	return &model.PresetParam{
 		GPUCountRequirement: "1",
 		ReadinessTimeout:    time.Duration(30) * time.Minute,
+		BaseCommand:         "python3",
 	}
 }
 func (*testDistributedModel) GetTuningParameters() *model.PresetParam {
30 changes: 19 additions & 11 deletions presets/models/supported_models.yaml
@@ -34,13 +34,14 @@ models:
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36
     runtime: tfs
-    tag: 0.0.6
+    tag: 0.0.7
   - name: falcon-7b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99
     runtime: tfs
-    tag: 0.0.6
+    tag: 0.0.7
     # Tag history:
+    # 0.0.7 - Support vllm runtime
     # 0.0.6 - Add Logging & Metrics Server
     # 0.0.5 - Tuning and Adapters
     # 0.0.4 - Adjust default model params (#310)
@@ -51,13 +52,14 @@ models:
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
   - name: falcon-40b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
     # Tag history for 40b models:
+    # 0.0.8 - Support vllm runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Tuning and Adapters
     # 0.0.5 - Adjust default model params (#310)
@@ -71,13 +73,14 @@ models:
     type: text-generation
     version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/c882233d224d27b727b3d9299b12a9aab9dda6f7
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
   - name: mistral-7b-instruct
     type: text-generation
     version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/0417f4babd26db0b5ed07c1d0bc85658ab526ea3
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
     # Tag history:
+    # 0.0.8 - Support vllm runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Update model version and Address missing weights files fix
     # 0.0.5 - Tuning and Adapters
@@ -91,8 +94,9 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670
     runtime: tfs
-    tag: 0.0.5
+    tag: 0.0.6
     # Tag history:
+    # 0.0.6 - Support vllm runtime
     # 0.0.5 - Add Logging & Metrics Server
     # 0.0.4 - Tuning and Adapters
     # 0.0.3 - Adjust default model params (#310)
@@ -104,34 +108,38 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/d269012bea6fbe38ce7752c8940fea010eea3383
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support vllm runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
 
   - name: phi-3-mini-128k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/5be6479b4bc06a081e8f4c6ece294241ccd32dec
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support vllm runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
 
   - name: phi-3-medium-4k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/d194e4e74ffad5a5e193e26af25bcfc80c7f1ffc
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support vllm runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
 
   - name: phi-3-medium-128k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/cae1d42b5577398fd1be9f0746052562ae552886
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support vllm runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
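For context, these tag bumps are what image-resolution tooling would pick up from this file. A minimal, self-contained sketch of reading it (the struct names and the registry placeholder are assumptions, not part of this commit):

package main

import (
	"fmt"
	"os"

	"gopkg.in/yaml.v3"
)

// Keys mirror the entries in supported_models.yaml; the types themselves
// are hypothetical and only for this sketch.
type modelEntry struct {
	Name    string `yaml:"name"`
	Type    string `yaml:"type"`
	Version string `yaml:"version"`
	Runtime string `yaml:"runtime"`
	Tag     string `yaml:"tag"`
}

type supportedModels struct {
	Models []modelEntry `yaml:"models"`
}

func main() {
	raw, err := os.ReadFile("presets/models/supported_models.yaml")
	if err != nil {
		panic(err)
	}
	var sm supportedModels
	if err := yaml.Unmarshal(raw, &sm); err != nil {
		panic(err)
	}
	for _, m := range sm.Models {
		// "<registry>" is a placeholder; the actual image repository is
		// not specified in this file.
		fmt.Printf("%s -> <registry>/%s:%s\n", m.Name, m.Name, m.Tag)
	}
}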
@@ -19,21 +19,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 4 # Requesting 4 GPUs
           limits:
             nvidia.com/gpu: 4
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
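Because every probe path in these manifests moves from /healthz to /health, any external smoke test must target the new route. A minimal sketch, assuming the pod has been made reachable locally (for example via kubectl port-forward <pod> 5000:5000):

package main

import (
	"fmt"
	"net/http"
)

func main() {
	// /health on port 5000 is the probe route these manifests now use;
	// localhost assumes a kubectl port-forward is running.
	resp, err := http.Get("http://localhost:5000/health")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.StatusCode) // expect 200 once the server is ready
}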
6 changes: 3 additions & 3 deletions presets/test/manifests/falcon-40b/falcon-40b.yaml
@@ -19,21 +19,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 4 # Requesting 4 GPUs
           limits:
             nvidia.com/gpu: 4
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
@@ -30,21 +30,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 2
           limits:
             nvidia.com/gpu: 2 # Requesting 2 GPUs
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
@@ -18,21 +18,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 2
           limits:
             nvidia.com/gpu: 2 # Requesting 2 GPUs
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
@@ -29,7 +29,7 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 2
6 changes: 3 additions & 3 deletions presets/test/manifests/falcon-7b/falcon-7b.yaml
@@ -18,21 +18,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
             nvidia.com/gpu: 2
           limits:
             nvidia.com/gpu: 2 # Requesting 2 GPUs
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
6 changes: 3 additions & 3 deletions presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml
@@ -35,21 +35,21 @@ spec:
         - |
           echo "MASTER_ADDR: $MASTER_ADDR"
           NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
-          cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py
+          cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 /workspace/tfs/inference_api.py
         resources:
           limits:
             nvidia.com/gpu: "1"
           requests:
             nvidia.com/gpu: "1"
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
6 changes: 3 additions & 3 deletions presets/test/manifests/llama-2-13b/llama-2-13b.yaml
@@ -35,21 +35,21 @@ spec:
         - |
           echo "MASTER_ADDR: $MASTER_ADDR"
           NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
-          cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 inference_api.py
+          cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 /workspace/tfs/inference_api.py
         resources:
           limits:
             nvidia.com/gpu: "1"
           requests:
             nvidia.com/gpu: "1"
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
6 changes: 3 additions & 3 deletions presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml
@@ -19,21 +19,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - cd /workspace/llama/llama-2 && torchrun inference_api.py
+        - cd /workspace/llama/llama-2 && torchrun /workspace/tfs/inference_api.py
         resources:
           limits:
             nvidia.com/gpu: "1"
           requests:
             nvidia.com/gpu: "1"
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10
6 changes: 3 additions & 3 deletions presets/test/manifests/llama-2-7b/llama-2-7b.yaml
@@ -19,21 +19,21 @@ spec:
         command:
         - /bin/sh
         - -c
-        - cd /workspace/llama/llama-2 && torchrun inference_api.py
+        - cd /workspace/llama/llama-2 && torchrun /workspace/tfs/inference_api.py
         resources:
           limits:
             nvidia.com/gpu: "1"
           requests:
             nvidia.com/gpu: "1"
         livenessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 600 # 10 Min
           periodSeconds: 10
         readinessProbe:
           httpGet:
-            path: /healthz
+            path: /health
             port: 5000
           initialDelaySeconds: 30
           periodSeconds: 10