
Commit 1d05281
fix lmi/vllm virtual envs, update to vllm 0.7.1 (#2703)
siddvenk authored Feb 3, 2025
1 parent b5e4ee9 commit 1d05281
Showing 8 changed files with 28 additions and 29 deletions.
@@ -21,16 +21,11 @@
     resolve_chat_template_content_format)


-def is_chat_completions_request(inputs: Dict) -> bool:
-    return "messages" in inputs
-
-
 def parse_chat_completions_request_vllm(
     input_map: Dict,
     is_rolling_batch: bool,
     rolling_batch,
     tokenizer,
-    chat_template: Optional[str] = None,
     configs: Properties = None,
     is_mistral_tokenizer: bool = False,
 ):
@@ -41,12 +36,6 @@ def parse_chat_completions_request_vllm(
             "You must enable rolling batch to use the chat completions format."
         )

-    if not is_mistral_tokenizer and not hasattr(tokenizer,
-                                                "apply_chat_template"):
-        raise AttributeError(
-            f"Cannot provide chat completion for tokenizer: {tokenizer.__class__}, "
-            f"please ensure that your tokenizer supports chat templates.")
-
     tool_parser = rolling_batch.get_tool_parser()
     chat_params = ChatProperties(**input_map)

@@ -85,16 +74,15 @@ def parse_chat_completions_request_vllm(
     if is_mistral_tokenizer:
         text_inputs = apply_mistral_chat_template(
             tokenizer,
-            messages=chat_params.messages,
-            chat_template=chat_template,
-            add_generation_prompt=True,
+            chat_params.messages,
+            None,
             tools=tool_dicts,
         )
     else:
         text_inputs = apply_hf_chat_template(
             tokenizer,
-            conversation=conversation,
-            chat_template=chat_template,
+            conversation,
+            None,
             add_generation_prompt=True,
             tools=tool_dicts,
         )
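For readers following the chat-template change above: a minimal sketch of the vLLM 0.7.1 calling convention, assuming the helpers remain importable from vllm.entrypoints.chat_utils as in this file's imports. The render_prompt wrapper and its parameters are illustrative only; tokenizer, messages, conversation, and tool_dicts are assumed to be prepared the same way the surrounding DJL code prepares them.

# Sketch only: mirrors the 0.7.1 call pattern shown in the diff above; the
# wrapper name and its arguments are hypothetical, not part of the DJL code.
from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
                                         apply_mistral_chat_template)


def render_prompt(tokenizer, is_mistral_tokenizer, messages, conversation,
                  tool_dicts):
    if is_mistral_tokenizer:
        # messages and chat_template are now passed positionally; passing None
        # for chat_template falls back to the tokenizer's own template.
        return apply_mistral_chat_template(tokenizer, messages, None,
                                           tools=tool_dicts)
    # Hugging Face tokenizers: conversation and chat_template are positional,
    # while options such as add_generation_prompt stay keyword arguments.
    return apply_hf_chat_template(tokenizer, conversation, None,
                                  add_generation_prompt=True,
                                  tools=tool_dicts)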
@@ -66,6 +66,7 @@ class VllmRbProperties(Properties):
     # The following configs have different defaults, or additional processing in DJL compared to vLLM
     dtype: str = "auto"
     max_loras: int = 4
+    task: str = 'auto'
     # The following configs have broken processing in vllm via the FlexibleArgumentParser
     long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
     use_v2_block_manager: bool = True
@@ -89,6 +90,14 @@ def validate_engine(cls, engine):
                 f"Need python engine to start vLLM RollingBatcher")
         return engine

+    @field_validator('task')
+    def validate_task(cls, task):
+        # TODO: conflicts between HF and VLLM tasks, need to separate these.
+        # for backwards compatibility, map text-generation to generate
+        if task == 'text-generation':
+            task = 'generate'
+        return task
+
     @field_validator('dtype')
     def validate_dtype(cls, val):
         if val not in DTYPE_MAPPER:
@@ -114,6 +123,7 @@ def validate_tool_call_parser(self):
             raise ValueError(
                 f"Invalid tool call parser: {self.tool_call_parser} "
                 f"(chose from {{ {','.join(valid_tool_parses)} }})")
+        return self

     @field_validator('override_neuron_config', mode="before")
     def validate_override_neuron_config(cls, val):
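The two validator changes above follow standard pydantic v2 patterns: a field_validator can normalize a value before it is assigned, and a model_validator with mode='after' must return the model instance, which is what the added return self provides. Below is a standalone sketch under those assumptions; DemoProperties, its fields, and the example parser names are illustrative and not the real VllmRbProperties.

# Standalone sketch; not the real VllmRbProperties.
from typing import Optional

from pydantic import BaseModel, field_validator, model_validator


class DemoProperties(BaseModel):
    task: str = 'auto'
    tool_call_parser: Optional[str] = None

    @field_validator('task')
    def validate_task(cls, task):
        # Normalize the HF-style task name to the vLLM task name.
        if task == 'text-generation':
            task = 'generate'
        return task

    @model_validator(mode='after')
    def validate_tool_call_parser(self):
        # Example parser names only; the real list comes from vLLM.
        if self.tool_call_parser not in (None, 'mistral', 'hermes'):
            raise ValueError(
                f"Invalid tool call parser: {self.tool_call_parser}")
        return self  # mode='after' validators must return the instance


print(DemoProperties(task='text-generation').task)  # -> generate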
9 changes: 4 additions & 5 deletions serving/docker/lmi-container-requirements-common.txt
@@ -1,6 +1,6 @@
-peft==0.13.2
+peft
 protobuf==3.20.3
-transformers==4.45.2
+transformers>=4.45.2
 hf-transfer
 zstandard
 datasets==3.0.1
@@ -23,9 +23,8 @@ onnx
 sentence_transformers
 onnxruntime-gpu==1.20.0
 autoawq==0.2.5
-llmcompressor==0.3.1
-tokenizers==0.20.3
-pydantic==2.9.2
+tokenizers>=0.20.3
+pydantic>=2.9.2
 optimum==1.23.2
 torch==2.5.1
 torchvision==0.20.1
1 change: 1 addition & 0 deletions serving/docker/requirements-lmi.txt
@@ -1,4 +1,5 @@
 -r requirements-common.txt
+llmcompressor
 # flash infer kernels for vllm/lmi-dist
 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
 # vllm wheel built with pt2.5.1
3 changes: 2 additions & 1 deletion serving/docker/requirements-vllm.txt
@@ -1,2 +1,3 @@
 -r requirements-common.txt
-vllm==0.7.0
+llmcompressor
+vllm==0.7.1
8 changes: 1 addition & 7 deletions serving/docker/scripts/create_virtual_env.sh
@@ -7,12 +7,6 @@ requirements_file=$2
 # This was copied over from the previous pip install defined in the lmi.Dockerfile, so it's specific to that Dockerfile
 python -m venv --system-site-packages $venv_directory
 venv_pip="${venv_directory}/bin/pip"
-$venv_pip install -r $requirements_file
+$venv_pip install -r $requirements_file || exit 1
 $venv_pip install https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl --no-deps
-git clone https://github.com/neuralmagic/AutoFP8.git
-cd AutoFP8
-git reset --hard 4b2092c
-$venv_pip install .
-cd ..
-rm -rf AutoFP8
 $venv_pip cache purge
5 changes: 5 additions & 0 deletions tests/integration/llm/client.py
@@ -602,6 +602,11 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16"
     },
+    "mistral-7b": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16",
+    }
 }

 vllm_tool_model_spec = {
1 change: 1 addition & 0 deletions tests/integration/tests.py
@@ -571,6 +571,7 @@ def test_mistral_7b(self):
             prepare.build_vllm_model("mistral-7b")
             r.launch()
             client.run("vllm mistral-7b".split())
+            client.run("vllm_chat mistral-7b".split())

     def test_phi2(self):
         with Runner('lmi', 'phi-2') as r:
