diff --git a/clients/python/llmengine/__init__.py b/clients/python/llmengine/__init__.py index 17dacfa9..dfae78cf 100644 --- a/clients/python/llmengine/__init__.py +++ b/clients/python/llmengine/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.0.0b27" +__version__ = "0.0.0b28" import os from typing import Sequence diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 43d0813c..0181b733 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -1,4 +1,4 @@ -from typing import AsyncIterable, Iterator, List, Optional, Union +from typing import Any, AsyncIterable, Dict, Iterator, List, Optional, Union from llmengine.api_engine import APIEngine from llmengine.data_types import ( @@ -43,6 +43,10 @@ async def acreate( frequency_penalty: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, + include_stop_str_in_output: Optional[bool] = False, + guided_json: Optional[Dict[str, Any]] = None, + guided_regex: Optional[str] = None, + guided_choice: Optional[List[str]] = None, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]: @@ -102,6 +106,18 @@ async def acreate( Float that controls the cumulative probability of the top tokens to consider. Range: (0.0, 1.0]. 1.0 means consider all tokens. + include_stop_str_in_output (Optional[bool]): + Whether to include the stop sequence in the output. Default to False. + + guided_json (Optional[Dict[str, Any]]): + If specified, the output will follow the JSON schema. For examples see https://json-schema.org/learn/miscellaneous-examples. + + guided_regex (Optional[str]): + If specified, the output will follow the regex pattern. + + guided_choice (Optional[List[str]]): + If specified, the output will be exactly one of the choices. + timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. @@ -198,6 +214,10 @@ async def _acreate_stream( frequency_penalty=frequency_penalty, top_k=top_k, top_p=top_p, + include_stop_str_in_output=include_stop_str_in_output, + guided_json=guided_json, + guided_regex=guided_regex, + guided_choice=guided_choice, timeout=timeout, ) @@ -237,6 +257,10 @@ def create( frequency_penalty: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, + include_stop_str_in_output: Optional[bool] = False, + guided_json: Optional[Dict[str, Any]] = None, + guided_regex: Optional[str] = None, + guided_choice: Optional[List[str]] = None, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: @@ -297,6 +321,18 @@ def create( Float that controls the cumulative probability of the top tokens to consider. Range: (0.0, 1.0]. 1.0 means consider all tokens. + include_stop_str_in_output (Optional[bool]): + Whether to include the stop sequence in the output. Default to False. + + guided_json (Optional[Dict[str, Any]]): + If specified, the output will follow the JSON schema. + + guided_regex (Optional[str]): + If specified, the output will follow the regex pattern. + + guided_choice (Optional[List[str]]): + If specified, the output will be exactly one of the choices. + timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. @@ -396,6 +432,10 @@ def _create_stream(**kwargs): frequency_penalty=frequency_penalty, top_k=top_k, top_p=top_p, + include_stop_str_in_output=include_stop_str_in_output, + guided_json=guided_json, + guided_regex=guided_regex, + guided_choice=guided_choice, ).dict() response = cls.post_sync( resource_name=f"v1/llm/completions-sync?model_endpoint_name={model}", diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index 70abd6cb..a3ed3209 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -279,6 +279,10 @@ class CompletionSyncV1Request(BaseModel): frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=-1) top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) + include_stop_str_in_output: Optional[bool] = Field(default=False) + guided_json: Optional[Dict[str, Any]] = Field(default=None) + guided_regex: Optional[str] = Field(default=None) + guided_choice: Optional[List[str]] = Field(default=None) class TokenOutput(BaseModel): @@ -349,6 +353,10 @@ class CompletionStreamV1Request(BaseModel): frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=-1) top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) + include_stop_str_in_output: Optional[bool] = Field(default=False) + guided_json: Optional[Dict[str, Any]] = Field(default=None) + guided_regex: Optional[str] = Field(default=None) + guided_choice: Optional[List[str]] = Field(default=None) class CompletionStreamOutput(BaseModel): diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 2563b814..8ddec08f 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "scale-llm-engine" -version = "0.0.0.beta27" +version = "0.0.0.beta28" description = "Scale LLM Engine Python client" license = "Apache-2.0" authors = ["Phil Chen "] diff --git a/clients/python/setup.py b/clients/python/setup.py index 257516fc..a33e6a03 100644 --- a/clients/python/setup.py +++ b/clients/python/setup.py @@ -3,6 +3,6 @@ setup( name="scale-llm-engine", python_requires=">=3.7", - version="0.0.0.beta27", + version="0.0.0.beta28", packages=find_packages(), ) diff --git a/docs/guides/completions.md b/docs/guides/completions.md index 69dfe1bd..86bb9f0b 100644 --- a/docs/guides/completions.md +++ b/docs/guides/completions.md @@ -193,6 +193,59 @@ response = Completion.batch_create( print(response.json()) ``` +## Guided decoding + +Guided decoding is supported by vLLM and backed by [Outlines](https://github.com/outlines-dev/outlines). +It enforces certain token generation patterns by tinkering with the sampling logits. + +=== "Guided decoding with regex" +```python +from llmengine import Completion + +response = Completion.create( + model="llama-2-7b", + prompt="Hello, my name is", + max_new_tokens=10, + temperature=0.2, + guided_regex="Sean.*", +) + +print(response.json()) +# {"request_id":"c19f0fae-317e-4f69-8e06-c04189299b9c","output":{"text":"Sean. I'm a 2","num_prompt_tokens":6,"num_completion_tokens":10,"tokens":null}} +``` + +=== "Guided decoding with choice" +```python +from llmengine import Completion + +response = Completion.create( + model="llama-2-7b", + prompt="Hello, my name is", + max_new_tokens=10, + temperature=0.2, + guided_choice=["Sean", "Brian", "Tim"], +) + +print(response.json()) +# {"request_id":"641e2af3-a3e3-4493-98b9-d38115ba0d22","output":{"text":"Sean","num_prompt_tokens":6,"num_completion_tokens":4,"tokens":null}} +``` + +=== "Guided decoding with JSON schema" +```python +from llmengine import Completion + +response = Completion.create( + model="llama-2-7b", + prompt="Hello, my name is", + max_new_tokens=10, + temperature=0.2, + guided_json={"properties":{"myString":{"type":"string"}},"required":["myString"]}, +) + +print(response.json()) +# {"request_id":"5b184654-96b6-4932-9eb6-382a51fdb3d5","output":{"text":"{\"myString\" : \"John Doe","num_prompt_tokens":6,"num_completion_tokens":10,"tokens":null}} +``` + ## Which model should I use? See the [Model Zoo](../../model_zoo) for more information on best practices for which model to use for Completions. diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index 8d335d8d..9fb8ed1d 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -184,6 +184,18 @@ class CompletionSyncV1Request(BaseModel): """ Whether to include the stop strings in output text. """ + guided_json: Optional[Dict[str, Any]] = None + """ + JSON schema for guided decoding. + """ + guided_regex: Optional[str] = None + """ + Regex for guided decoding. + """ + guided_choice: Optional[List[str]] = None + """ + Choices for guided decoding. + """ class TokenOutput(BaseModel): @@ -248,6 +260,18 @@ class CompletionStreamV1Request(BaseModel): """ Whether to include the stop strings in output text. """ + guided_json: Optional[Dict[str, Any]] = None + """ + JSON schema for guided decoding. Only supported in vllm. + """ + guided_regex: Optional[str] = None + """ + Regex for guided decoding. Only supported in vllm. + """ + guided_choice: Optional[List[str]] = None + """ + Choices for guided decoding. Only supported in vllm. + """ class CompletionStreamOutput(BaseModel): diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index b458343c..65973ced 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -1365,6 +1365,26 @@ def validate_and_update_completion_params( "include_stop_str_in_output is only supported in vllm." ) + guided_count = 0 + if request.guided_choice is not None: + guided_count += 1 + if request.guided_json is not None: + guided_count += 1 + if request.guided_regex is not None: + guided_count += 1 + + if guided_count > 1: + raise ObjectHasInvalidValueException( + "Only one of guided_json, guided_choice, guided_regex can be enabled." + ) + + if ( + request.guided_choice is not None + or request.guided_regex is not None + or request.guided_json is not None + ) and not inference_framework == LLMInferenceFramework.VLLM: + raise ObjectHasInvalidValueException("Guided decoding is only supported in vllm.") + return request @@ -1656,6 +1676,12 @@ async def execute( vllm_args["logprobs"] = 1 if request.include_stop_str_in_output is not None: vllm_args["include_stop_str_in_output"] = request.include_stop_str_in_output + if request.guided_choice is not None: + vllm_args["guided_choice"] = request.guided_choice + if request.guided_regex is not None: + vllm_args["guided_regex"] = request.guided_regex + if request.guided_json is not None: + vllm_args["guided_json"] = request.guided_json inference_request = SyncEndpointPredictV1Request( args=vllm_args, @@ -1918,6 +1944,12 @@ async def execute( args["logprobs"] = 1 if request.include_stop_str_in_output is not None: args["include_stop_str_in_output"] = request.include_stop_str_in_output + if request.guided_choice is not None: + args["guided_choice"] = request.guided_choice + if request.guided_regex is not None: + args["guided_regex"] = request.guided_regex + if request.guided_json is not None: + args["guided_json"] = request.guided_json args["stream"] = True elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: args = { diff --git a/model-engine/model_engine_server/inference/vllm/requirements.txt b/model-engine/model_engine_server/inference/vllm/requirements.txt index 78e033bb..3c1cf851 100644 --- a/model-engine/model_engine_server/inference/vllm/requirements.txt +++ b/model-engine/model_engine_server/inference/vllm/requirements.txt @@ -1,3 +1,2 @@ -ray>=2.9 -vllm==0.3.2 +vllm==0.3.3 pydantic>=2.0 diff --git a/model-engine/model_engine_server/inference/vllm/vllm_server.py b/model-engine/model_engine_server/inference/vllm/vllm_server.py index 5bd3f6e4..c4dd0eed 100644 --- a/model-engine/model_engine_server/inference/vllm/vllm_server.py +++ b/model-engine/model_engine_server/inference/vllm/vllm_server.py @@ -7,10 +7,12 @@ from typing import AsyncGenerator import uvicorn -from fastapi import BackgroundTasks, FastAPI, Request +from fastapi import BackgroundTasks, FastAPI, HTTPException, Request from fastapi.responses import Response, StreamingResponse from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.openai.protocol import CompletionRequest as OpenAICompletionRequest +from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid @@ -38,7 +40,35 @@ async def generate(request: Request) -> Response: request_dict = await request.json() prompt = request_dict.pop("prompt") stream = request_dict.pop("stream", False) + guided_json = request_dict.pop("guided_json", None) + guided_regex = request_dict.pop("guided_regex", None) + guided_choice = request_dict.pop("guided_choice", None) sampling_params = SamplingParams(**request_dict) + + # Dummy request to get guided decode logit processor + try: + partial_openai_request = OpenAICompletionRequest.model_validate( + { + "model": "", + "prompt": "", + "guided_json": guided_json, + "guided_regex": guided_regex, + "guided_choice": guided_choice, + } + ) + except Exception: + raise HTTPException( + status_code=400, detail="Bad request: failed to parse guided decoding parameters." + ) + + guided_decode_logit_processor = await get_guided_decoding_logits_processor( + partial_openai_request, engine.get_tokenizer() + ) + if guided_decode_logit_processor is not None: + if sampling_params.logits_processors is None: + sampling_params.logits_processors = [] + sampling_params.logits_processors.append(guided_decode_logit_processor) + request_id = random_uuid() results_generator = engine.generate(prompt, sampling_params, request_id) diff --git a/model-engine/tests/unit/conftest.py b/model-engine/tests/unit/conftest.py index 61473b37..4b57afa1 100644 --- a/model-engine/tests/unit/conftest.py +++ b/model-engine/tests/unit/conftest.py @@ -3725,6 +3725,138 @@ def llm_model_endpoint_sync( return model_endpoint, model_endpoint_json +@pytest.fixture +def llm_model_endpoint_stream( + test_api_key: str, model_bundle_1: ModelBundle +) -> Tuple[ModelEndpoint, Any]: + model_endpoint = ModelEndpoint( + record=ModelEndpointRecord( + id="test_llm_model_endpoint_id_2", + name="test_llm_model_endpoint_name_1", + created_by=test_api_key, + created_at=datetime(2022, 1, 3), + last_updated_at=datetime(2022, 1, 3), + metadata={ + "_llm": { + "model_name": "llama-7b", + "source": "hugging_face", + "inference_framework": "vllm", + "inference_framework_image_tag": "123", + "num_shards": 4, + } + }, + creation_task_id="test_creation_task_id", + endpoint_type=ModelEndpointType.STREAMING, + destination="test_destination", + status=ModelEndpointStatus.READY, + current_model_bundle=model_bundle_1, + owner=test_api_key, + public_inference=True, + ), + infra_state=ModelEndpointInfraState( + deployment_name=f"{test_api_key}-test_llm_model_endpoint_name_1", + aws_role="test_aws_role", + results_s3_bucket="test_s3_bucket", + child_fn_info=None, + labels={}, + prewarm=True, + high_priority=False, + deployment_state=ModelEndpointDeploymentState( + min_workers=1, + max_workers=3, + per_worker=2, + available_workers=1, + unavailable_workers=1, + ), + resource_state=ModelEndpointResourceState( + cpus=1, + gpus=1, + memory="1G", + gpu_type=GpuType.NVIDIA_TESLA_T4, + storage="10G", + optimize_costs=True, + ), + user_config_state=ModelEndpointUserConfigState( + app_config=model_bundle_1.app_config, + endpoint_config=ModelEndpointConfig( + bundle_name=model_bundle_1.name, + endpoint_name="test_llm_model_endpoint_name_1", + post_inference_hooks=["callback"], + default_callback_url="http://www.example.com", + default_callback_auth=CallbackAuth( + __root__=CallbackBasicAuth( + kind="basic", + username="test_username", + password="test_password", + ), + ), + ), + ), + num_queued_items=1, + image="test_image", + ), + ) + model_endpoint_json: Dict[str, Any] = { + "id": "test_llm_model_endpoint_id_2", + "name": "test_llm_model_endpoint_name_1", + "model_name": "llama-7b", + "source": "hugging_face", + "status": "READY", + "inference_framework": "vllm", + "inference_framework_image_tag": "123", + "num_shards": 4, + "spec": { + "id": "test_llm_model_endpoint_id_2", + "name": "test_llm_model_endpoint_name_1", + "endpoint_type": "streaming", + "destination": "test_destination", + "deployment_name": f"{test_api_key}-test_llm_model_endpoint_name_1", + "metadata": { + "_llm": { + "model_name": "llama-7b", + "source": "hugging_face", + "inference_framework": "vllm", + "inference_framework_image_tag": "123", + "num_shards": 4, + } + }, + "bundle_name": "test_model_bundle_name_1", + "status": "READY", + "post_inference_hooks": ["callback"], + "default_callback_url": "http://www.example.com", + "default_callback_auth": { + "kind": "basic", + "username": "test_username", + "password": "test_password", + }, + "labels": {}, + "aws_role": "test_aws_role", + "results_s3_bucket": "test_s3_bucket", + "created_by": test_api_key, + "created_at": "2022-01-03T00:00:00", + "last_updated_at": "2022-01-03T00:00:00", + "deployment_state": { + "min_workers": 1, + "max_workers": 3, + "per_worker": 2, + "available_workers": 1, + "unavailable_workers": 1, + }, + "resource_state": { + "cpus": "1", + "gpus": 1, + "memory": "1G", + "gpu_type": "nvidia-tesla-t4", + "storage": "10G", + "optimize_costs": True, + }, + "num_queued_items": 1, + "public_inference": True, + }, + } + return model_endpoint, model_endpoint_json + + @pytest.fixture def llm_model_endpoint_sync_tgi( test_api_key: str, model_bundle_1: ModelBundle diff --git a/model-engine/tests/unit/domain/test_llm_use_cases.py b/model-engine/tests/unit/domain/test_llm_use_cases.py index 10b37c7d..b2496ff9 100644 --- a/model-engine/tests/unit/domain/test_llm_use_cases.py +++ b/model-engine/tests/unit/domain/test_llm_use_cases.py @@ -51,6 +51,7 @@ UpdateLLMModelEndpointV1UseCase, _include_safetensors_bin_or_pt, infer_hardware_from_model_name, + validate_and_update_completion_params, ) from model_engine_server.domain.use_cases.model_bundle_use_cases import CreateModelBundleV2UseCase @@ -602,6 +603,8 @@ async def test_completion_sync_use_case_success( llm_model_endpoint_sync: Tuple[ModelEndpoint, Any], completion_sync_request: CompletionSyncV1Request, ): + completion_sync_request.include_stop_str_in_output = True + completion_sync_request.guided_json = {} fake_llm_model_endpoint_service.add_model_endpoint(llm_model_endpoint_sync[0]) fake_model_endpoint_service.sync_model_endpoint_inference_gateway.response = ( SyncEndpointPredictV1Response( @@ -987,6 +990,42 @@ async def test_completion_sync_use_case_not_sync_endpoint_raises( ) +@pytest.mark.asyncio +async def test_validate_and_update_completion_params(): + completion_sync_request = CompletionSyncV1Request( + prompt="What is machine learning?", + max_new_tokens=10, + temperature=0.5, + return_token_log_probs=True, + ) + + validate_and_update_completion_params(LLMInferenceFramework.VLLM, completion_sync_request) + + validate_and_update_completion_params( + LLMInferenceFramework.TEXT_GENERATION_INFERENCE, completion_sync_request + ) + + completion_sync_request.include_stop_str_in_output = True + with pytest.raises(ObjectHasInvalidValueException): + validate_and_update_completion_params( + LLMInferenceFramework.TEXT_GENERATION_INFERENCE, completion_sync_request + ) + completion_sync_request.include_stop_str_in_output = None + + completion_sync_request.guided_regex = "" + completion_sync_request.guided_json = {} + completion_sync_request.guided_choice = [""] + with pytest.raises(ObjectHasInvalidValueException): + validate_and_update_completion_params(LLMInferenceFramework.VLLM, completion_sync_request) + + completion_sync_request.guided_regex = None + completion_sync_request.guided_choice = None + with pytest.raises(ObjectHasInvalidValueException): + validate_and_update_completion_params( + LLMInferenceFramework.TEXT_GENERATION_INFERENCE, completion_sync_request + ) + + @pytest.mark.asyncio @mock.patch( "model_engine_server.domain.use_cases.llm_model_endpoint_use_cases.count_tokens", @@ -1079,6 +1118,116 @@ async def test_completion_stream_use_case_success( i += 1 +@pytest.mark.asyncio +@mock.patch( + "model_engine_server.domain.use_cases.llm_model_endpoint_use_cases.count_tokens", + return_value=7, +) +async def test_completion_stream_vllm_use_case_success( + test_api_key: str, + fake_model_endpoint_service, + fake_llm_model_endpoint_service, + fake_tokenizer_repository, + llm_model_endpoint_stream: Tuple[ModelEndpoint, Any], + completion_stream_request: CompletionStreamV1Request, +): + completion_stream_request.guided_json = {} + fake_llm_model_endpoint_service.add_model_endpoint(llm_model_endpoint_stream[0]) + fake_model_endpoint_service.streaming_model_endpoint_inference_gateway.responses = [ + SyncEndpointPredictV1Response( + status=TaskStatus.SUCCESS, + result={ + "result": { + "text": "I", + "finished": False, + "count_prompt_tokens": 7, + "count_output_tokens": 1, + } + }, + traceback=None, + ), + SyncEndpointPredictV1Response( + status=TaskStatus.SUCCESS, + result={ + "result": { + "text": " am", + "finished": False, + "count_prompt_tokens": 7, + "count_output_tokens": 2, + } + }, + traceback=None, + ), + SyncEndpointPredictV1Response( + status=TaskStatus.SUCCESS, + result={ + "result": { + "text": " a", + "finished": False, + "count_prompt_tokens": 7, + "count_output_tokens": 3, + } + }, + traceback=None, + ), + SyncEndpointPredictV1Response( + status=TaskStatus.SUCCESS, + result={ + "result": { + "text": " new", + "finished": False, + "count_prompt_tokens": 7, + "count_output_tokens": 4, + } + }, + traceback=None, + ), + SyncEndpointPredictV1Response( + status=TaskStatus.SUCCESS, + result={ + "result": { + "text": "bie", + "finished": False, + "count_prompt_tokens": 7, + "count_output_tokens": 5, + } + }, + traceback=None, + ), + SyncEndpointPredictV1Response( + status=TaskStatus.SUCCESS, + result={ + "result": { + "text": ".", + "finished": True, + "count_prompt_tokens": 7, + "count_output_tokens": 6, + } + }, + traceback=None, + ), + ] + use_case = CompletionStreamV1UseCase( + model_endpoint_service=fake_model_endpoint_service, + llm_model_endpoint_service=fake_llm_model_endpoint_service, + tokenizer_repository=fake_tokenizer_repository, + ) + user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True) + response_1 = use_case.execute( + user=user, + model_endpoint_name=llm_model_endpoint_stream[0].record.name, + request=completion_stream_request, + ) + output_texts = ["I", " am", " a", " new", "bie", ".", "I am a newbie."] + i = 0 + async for message in response_1: + assert message.dict()["output"]["text"] == output_texts[i] + if i == 5: + assert message.dict()["output"]["num_prompt_tokens"] == 7 + assert message.dict()["output"]["num_completion_tokens"] == 6 + i += 1 + + @pytest.mark.asyncio @mock.patch( "model_engine_server.domain.use_cases.llm_model_endpoint_use_cases.count_tokens",