Skip to content

Commit

Permalink
feat: support adaptive max_model_len (#657)
Browse files Browse the repository at this point in the history
**Reason for Change**:
<!-- What does this PR improve or fix in Kaito? Why is it needed? -->

- upgrade the base image to python-3.12
- add a new model: phi-3.5-mini-instruct
- support adaptive `max_model_len`

---------

Signed-off-by: jerryzhuang <[email protected]>
Co-authored-by: Fei Guo <[email protected]>
  • Loading branch information
zhuangqh and Fei-Guo authored Nov 12, 2024
1 parent 5812927 commit 1c6eb2e
Show file tree
Hide file tree
Showing 25 changed files with 165 additions and 64 deletions.
2 changes: 1 addition & 1 deletion docker/presets/models/tfs/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.10-slim@sha256:684b1aaf96a7942b3c3af438d162e0baa3510aa7af25ad76d238e0c746bdec79
FROM python:3.12-slim

ARG WEIGHTS_PATH
ARG MODEL_TYPE
Expand Down
2 changes: 2 additions & 0 deletions pkg/utils/test/testModel.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ func (*testModel) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: "python3",
}
}
func (*testModel) GetTuningParameters() *model.PresetParam {
Expand All @@ -37,6 +38,7 @@ func (*testDistributedModel) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: "python3",
}
}
func (*testDistributedModel) GetTuningParameters() *model.PresetParam {
Expand Down
6 changes: 3 additions & 3 deletions pkg/workspace/inference/preset-inferences_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func TestCreatePresetInference(t *testing.T) {
workload: "Deployment",
// No TorchRunParams, TorchRunRdzvParams, or ModelRunParams
// So expected cmd consists of shell command, BaseCommand (python3) and inference file
expectedCmd: "/bin/sh -c inference_api.py",
expectedCmd: "/bin/sh -c python3 inference_api.py",
hasAdapters: false,
},

Expand All @@ -58,7 +58,7 @@ func TestCreatePresetInference(t *testing.T) {
c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil)
},
workload: "StatefulSet",
expectedCmd: "/bin/sh -c inference_api.py",
expectedCmd: "/bin/sh -c python3 inference_api.py",
hasAdapters: false,
},

Expand All @@ -69,7 +69,7 @@ func TestCreatePresetInference(t *testing.T) {
c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil)
},
workload: "Deployment",
expectedCmd: "/bin/sh -c inference_api.py",
expectedCmd: "/bin/sh -c python3 inference_api.py",
hasAdapters: true,
expectedVolume: "adapter-volume",
},
Expand Down
62 changes: 58 additions & 4 deletions presets/inference/vllm/inference_api.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import gc
import os

import uvloop
import torch
from vllm.utils import FlexibleArgumentParser
import vllm.entrypoints.openai.api_server as api_server
from vllm.engine.llm_engine import (LLMEngine, EngineArgs, EngineConfig)

# Initialize logger
logger = logging.getLogger(__name__)
Expand All @@ -26,22 +29,73 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
# See https://docs.vllm.ai/en/latest/models/engine_args.html for more args
engine_default_args = {
"model": "/workspace/vllm/weights",
"cpu-offload-gb": 0,
"gpu-memory-utilization": 0.9,
"swap-space": 4,
"disable-log-stats": False,
"cpu_offload_gb": 0,
"gpu_memory_utilization": 0.95,
"swap_space": 4,
"disable_log_stats": False,
"uvicorn_log_level": "error"
}
parser.set_defaults(**engine_default_args)

return parser

def find_max_available_seq_len(engine_config: EngineConfig) -> int:
    """
    Load the model, run the profiler, and report the max available seq len.

    A throwaway executor is built from the sub-configs of ``engine_config``
    and asked how many GPU cache blocks are available. The executor is then
    dropped and cached CUDA memory is released before returning.

    Args:
        engine_config: Engine configuration providing the per-component
            configs handed to the executor and the cache block size.

    Returns:
        ``cache_config.block_size`` multiplied by the number of GPU blocks
        reported by the executor.
    """
    # Names of the engine_config attributes forwarded verbatim as keyword
    # arguments of the same name to the executor constructor.
    forwarded_configs = (
        "model_config",
        "cache_config",
        "parallel_config",
        "scheduler_config",
        "device_config",
        "lora_config",
        "speculative_config",
        "load_config",
        "prompt_adapter_config",
        "observability_config",
    )

    # see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/engine/llm_engine.py#L335
    make_executor = LLMEngine._get_executor_cls(engine_config)
    executor_kwargs = {
        field: getattr(engine_config, field) for field in forwarded_configs
    }
    profiling_executor = make_executor(**executor_kwargs)

    # see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/engine/llm_engine.py#L477
    gpu_block_count, _unused = profiling_executor.determine_num_available_blocks()

    # release memory
    del profiling_executor
    gc.collect()
    torch.cuda.empty_cache()

    tokens_per_block = engine_config.cache_config.block_size
    return tokens_per_block * gpu_block_count

if __name__ == "__main__":
parser = FlexibleArgumentParser(description='vLLM serving server')
parser = api_server.make_arg_parser(parser)
parser = make_arg_parser(parser)
args = parser.parse_args()

if args.max_model_len is None:
engine_args = EngineArgs.from_cli_args(args)
# Read the model config from the HF weights path.
# vLLM applies a different parser for each model architecture
# and normalizes the result into a unified EngineConfig.
engine_config = engine_args.create_engine_config()

logger.info("Try run profiler to find max available seq len")
available_seq_len = find_max_available_seq_len(engine_config)
# see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/worker/worker.py#L262
if available_seq_len <= 0:
raise ValueError("No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when "
"initializing the engine.")
max_model_len = engine_config.model_config.max_model_len
if available_seq_len > max_model_len:
available_seq_len = max_model_len

if available_seq_len != max_model_len:
logger.info(f"Set max_model_len from {max_model_len} to {available_seq_len}")
args.max_model_len = available_seq_len

# Run the serving server
logger.info(f"Starting server on port {args.port}")
# See https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html for more
Expand Down
45 changes: 45 additions & 0 deletions presets/models/phi3/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,25 @@ func init() {
Name: PresetPhi3Medium128kModel,
Instance: &phi3MediumB,
})
plugin.KaitoModelRegister.Register(&plugin.Registration{
Name: PresetPhi3_5MiniInstruct,
Instance: &phi3_5MiniC,
})
}

var (
PresetPhi3Mini4kModel = "phi-3-mini-4k-instruct"
PresetPhi3Mini128kModel = "phi-3-mini-128k-instruct"
PresetPhi3Medium4kModel = "phi-3-medium-4k-instruct"
PresetPhi3Medium128kModel = "phi-3-medium-128k-instruct"
PresetPhi3_5MiniInstruct = "phi-3.5-mini-instruct"

PresetPhiTagMap = map[string]string{
"Phi3Mini4kInstruct": "0.0.2",
"Phi3Mini128kInstruct": "0.0.2",
"Phi3Medium4kInstruct": "0.0.2",
"Phi3Medium128kInstruct": "0.0.2",
"Phi3_5MiniInstruct": "0.0.1",
}

baseCommandPresetPhiInference = "accelerate launch"
Expand Down Expand Up @@ -130,6 +136,45 @@ func (*phi3Mini128KInst) SupportTuning() bool {
return true
}

// phi3_5MiniC is the instance registered with plugin.KaitoModelRegister
// under the PresetPhi3_5MiniInstruct name.
var phi3_5MiniC phi3_5MiniInst

// phi3_5MiniInst carries no state; its methods supply the inference and
// tuning presets for the phi-3.5-mini-instruct model.
type phi3_5MiniInst struct{}

// GetInferenceParameters returns a freshly allocated preset describing how
// phi-3.5-mini-instruct is run for inference: image identity, resource
// requirements, and the launch command with its parameters.
func (*phi3_5MiniInst) GetInferenceParameters() *model.PresetParam {
	preset := new(model.PresetParam)

	// Image identity.
	preset.ModelFamilyName = "Phi3_5"
	preset.ImageAccessMode = string(kaitov1alpha1.ModelImageAccessModePublic)
	preset.Tag = PresetPhiTagMap["Phi3_5MiniInstruct"]

	// Resource requirements.
	preset.DiskStorageRequirement = "50Gi"
	preset.GPUCountRequirement = "1"
	preset.TotalGPUMemoryRequirement = "8Gi"
	// We run Phi using native vertical model parallel, no per GPU memory requirement.
	preset.PerGPUMemoryRequirement = "0Gi"

	// Launch command and readiness.
	preset.BaseCommand = baseCommandPresetPhiInference
	preset.TorchRunParams = inference.DefaultAccelerateParams
	preset.ModelRunParams = phiRunParams
	preset.ReadinessTimeout = 30 * time.Minute

	return preset
}
// GetTuningParameters returns a freshly allocated preset describing how
// phi-3.5-mini-instruct is run for tuning. TorchRunParams and ModelRunParams
// are not set here.
func (*phi3_5MiniInst) GetTuningParameters() *model.PresetParam {
	const (
		diskStorage = "50Gi"
		gpuCount    = "1"
		gpuMemory   = "72Gi" // used for both the total and the per-GPU requirement
	)

	tuning := model.PresetParam{
		ModelFamilyName:           "Phi3_5",
		ImageAccessMode:           string(kaitov1alpha1.ModelImageAccessModePublic),
		Tag:                       PresetPhiTagMap["Phi3_5MiniInstruct"],
		DiskStorageRequirement:    diskStorage,
		GPUCountRequirement:       gpuCount,
		TotalGPUMemoryRequirement: gpuMemory,
		PerGPUMemoryRequirement:   gpuMemory,
		// TorchRunParams:         inference.DefaultAccelerateParams,
		// ModelRunParams:         phiRunParams,
		BaseCommand:      baseCommandPresetPhiTuning,
		ReadinessTimeout: 30 * time.Minute,
	}
	return &tuning
}
// SupportDistributedInference reports whether this preset can be served
// across multiple nodes; it always returns false for phi-3.5-mini-instruct.
func (*phi3_5MiniInst) SupportDistributedInference() bool {
	const distributed = false
	return distributed
}
// SupportTuning reports whether this preset can be used for tuning; it
// always returns true for phi-3.5-mini-instruct.
func (*phi3_5MiniInst) SupportTuning() bool { return true }

var phi3MediumA Phi3Medium4kInstruct

type Phi3Medium4kInstruct struct{}
Expand Down
2 changes: 1 addition & 1 deletion presets/models/supported_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,4 @@ models:
tag: 0.0.2
# Tag history:
# 0.0.2 - Add Logging & Metrics Server
# 0.0.1 - Initial Release
# 0.0.1 - Initial Release
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 4 # Requesting 4 GPUs
limits:
nvidia.com/gpu: 4
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
Expand Down
6 changes: 3 additions & 3 deletions presets/test/manifests/falcon-40b/falcon-40b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 4 # Requesting 4 GPUs
limits:
nvidia.com/gpu: 4
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
Expand Down
6 changes: 3 additions & 3 deletions presets/test/manifests/falcon-7b/falcon-7b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
Expand Down
6 changes: 3 additions & 3 deletions presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,21 @@ spec:
- |
echo "MASTER_ADDR: $MASTER_ADDR"
NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py
cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 /workspace/tfs/inference_api.py
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
Expand Down
Loading

0 comments on commit 1c6eb2e

Please sign in to comment.