From 3cf87b5fa700abdc5021b689dced23c081417ea3 Mon Sep 17 00:00:00 2001
From: jerryzhuang
Date: Wed, 4 Dec 2024 22:36:36 +1100
Subject: [PATCH 1/2] docs: update for multi-runtime support

Signed-off-by: jerryzhuang
---
 README.md                                     | 40 +++++++++++--------
 charts/kaito/workspace/values.yaml            |  1 +
 docs/inference/README.md                      | 28 ++++++++++++-
 .../kaito_workspace_phi_3.5-instruct.yaml     | 12 ++++++
 4 files changed, 64 insertions(+), 17 deletions(-)
 create mode 100644 examples/inference/kaito_workspace_phi_3.5-instruct.yaml

diff --git a/README.md b/README.md
index adfdf069b..60a82b0b1 100644
--- a/README.md
+++ b/README.md
@@ -14,8 +14,9 @@ Kaito is an operator that automates the AI/ML model inference or tuning workload
 The target models are popular open-sourced large models such as [falcon](https://huggingface.co/tiiuae) and [phi-3](https://huggingface.co/docs/transformers/main/en/model_doc/phi3). Kaito has the following key differentiations compared to most of the mainstream model deployment methodologies built on top of virtual machine infrastructures:

-- Manage large model files using container images. A http server is provided to perform inference calls using the model library.
+- Manage large model files using container images. An OpenAI-compatible server is provided to perform inference calls.
 - Provide preset configurations to avoid adjusting workload parameters based on GPU hardware.
+- Provide support for popular open-sourced inference runtimes: [vLLM](https://github.com/vllm-project/vllm) and [transformers](https://github.com/huggingface/transformers).
 - Auto-provision GPU nodes based on model requirements.
 - Host large model images in the public Microsoft Container Registry (MCR) if the license allows.

@@ -40,43 +41,50 @@ Please check the installation guidance [here](./docs/installation.md) for deploy

 ## Quick start

-After installing Kaito, one can try following commands to start a falcon-7b inference service.
+After installing Kaito, one can try the following commands to start a phi-3.5-mini-instruct inference service.

 ```sh
-$ cat examples/inference/kaito_workspace_falcon_7b.yaml
+$ cat examples/inference/kaito_workspace_phi_3.5-instruct.yaml
 apiVersion: kaito.sh/v1alpha1
 kind: Workspace
 metadata:
-  name: workspace-falcon-7b
+  name: workspace-phi-3-5-mini
 resource:
   instanceType: "Standard_NC12s_v3"
   labelSelector:
     matchLabels:
-      apps: falcon-7b
+      apps: phi-3-5
 inference:
   preset:
-    name: "falcon-7b"
+    name: phi-3.5-mini-instruct

-$ kubectl apply -f examples/inference/kaito_workspace_falcon_7b.yaml
+$ kubectl apply -f examples/inference/kaito_workspace_phi_3.5-instruct.yaml
 ```

 The workspace status can be tracked by running the following command. When the WORKSPACEREADY column becomes `True`, the model has been deployed successfully.

 ```sh
-$ kubectl get workspace workspace-falcon-7b
-NAME INSTANCE RESOURCEREADY INFERENCEREADY JOBSTARTED WORKSPACESUCCEEDED AGE
-workspace-falcon-7b Standard_NC12s_v3 True True True True 10m
+$ kubectl get workspace workspace-phi-3-5-mini
+NAME INSTANCE RESOURCEREADY INFERENCEREADY JOBSTARTED WORKSPACESUCCEEDED AGE
+workspace-phi-3-5-mini Standard_NC12s_v3 True True True 4h15m
 ```

 Next, one can find the inference service's cluster ip and use a temporal `curl` pod to test the service endpoint in the cluster.
 ```sh
-$ kubectl get svc workspace-falcon-7b
-NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
-workspace-falcon-7b ClusterIP 80/TCP,29500/TCP 10m
-
-export CLUSTERIP=$(kubectl get svc workspace-falcon-7b -o jsonpath="{.spec.clusterIPs[0]}")
-$ kubectl run -it --rm --restart=Never curl --image=curlimages/curl -- curl -X POST http://$CLUSTERIP/chat -H "accept: application/json" -H "Content-Type: application/json" -d "{\"prompt\":\"YOUR QUESTION HERE\"}"
+$ kubectl get svc workspace-phi-3-5-mini
+NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
+workspace-phi-3-5-mini ClusterIP 80/TCP,29500/TCP 10m
+
+export CLUSTERIP=$(kubectl get svc workspace-phi-3-5-mini -o jsonpath="{.spec.clusterIPs[0]}")
+$ kubectl run -it --rm --restart=Never curl --image=curlimages/curl -- curl -X POST http://$CLUSTERIP/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "phi-3.5-mini-instruct",
+    "prompt": "What is kubernetes?",
+    "max_tokens": 7,
+    "temperature": 0
+  }'
 ```

 ## Usage

diff --git a/charts/kaito/workspace/values.yaml b/charts/kaito/workspace/values.yaml
index 5f52cf3de..bed646dfb 100644
--- a/charts/kaito/workspace/values.yaml
+++ b/charts/kaito/workspace/values.yaml
@@ -17,6 +17,7 @@ securityContext:
     - "ALL"
 featureGates:
   Karpenter: "false"
+  vLLM: "true"
 webhook:
   port: 9443
 presetRegistryName: mcr.microsoft.com/aks/kaito

diff --git a/docs/inference/README.md b/docs/inference/README.md
index 04014e8fa..ef7032a75 100644
--- a/docs/inference/README.md
+++ b/docs/inference/README.md
@@ -12,7 +12,7 @@ kind: Workspace
 metadata:
   name: workspace-falcon-7b
 resource:
-  instanceType: "Standard_NC12s_v3"
+  instanceType: "Standard_NC6s_v3"
   labelSelector:
     matchLabels:
       apps: falcon-7b
@@ -40,6 +40,29 @@ Next, the user needs to add the node names in the `preferredNodes` field in the
 > [!IMPORTANT]
 > The node objects of the preferred nodes need to contain the same matching labels as specified in the `resource` spec. Otherwise, the Kaito controller would not recognize them.

+### Inference runtime selection
+
+KAITO now supports both the [vLLM](https://github.com/vllm-project/vllm) and [transformers](https://github.com/huggingface/transformers) runtimes. `vLLM` delivers lower serving latency and higher throughput, while `transformers` offers broader compatibility with models in the Hugging Face model hub.
+
+Starting from KAITO v0.4.0, the default runtime is `vLLM`. To use the `transformers` runtime instead, set the `kaito.sh/runtime` annotation on the workspace. For example,
+
+```yaml
+apiVersion: kaito.sh/v1alpha1
+kind: Workspace
+metadata:
+  name: workspace-falcon-7b
+  annotations:
+    kaito.sh/runtime: "transformers"
+resource:
+  instanceType: "Standard_NC12s_v3"
+  labelSelector:
+    matchLabels:
+      apps: falcon-7b
+inference:
+  preset:
+    name: "falcon-7b"
+```
+
 ### Inference with LoRA adapters

 Kaito also supports running the inference workload with LoRA adapters produced by [model fine-tuning jobs](../tuning/README.md). Users can specify one or more adapters in the `adapters` field of the `inference` spec. For example,

@@ -69,6 +92,9 @@ Currently, only images are supported as adapter sources. The `strength` field sp

 For detailed `InferenceSpec` API definitions, refer to the [documentation](https://github.com/kaito-project/kaito/blob/2ccc93daf9d5385649f3f219ff131ee7c9c47f3e/api/v1alpha1/workspace_types.go#L75).

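+For illustration, an `inference` spec that loads a single adapter produced by a fine-tuning job might look like the minimal sketch below (field layout per the `InferenceSpec` definition linked above; the adapter name, image reference, and strength value are placeholders, not published artifacts):
+
+```yaml
+inference:
+  preset:
+    name: "falcon-7b"
+  adapters:
+    - source:
+        name: "falcon-7b-adapter"
+        # hypothetical adapter image produced by a Kaito tuning job and pushed to a private registry
+        image: "myregistry.azurecr.io/adapters/falcon-7b-adapter:v1"
+      strength: "1.0"
+```
+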
+### Inference API
+
+The OpenAPI specification for the inference API is available for each runtime: [vLLM API](../../presets/workspace/inference/vllm/api_spec.json), [transformers API](../../presets/workspace/inference/text-generation/api_spec.json).

 # Inference workload

diff --git a/examples/inference/kaito_workspace_phi_3.5-instruct.yaml b/examples/inference/kaito_workspace_phi_3.5-instruct.yaml
new file mode 100644
index 000000000..294f89819
--- /dev/null
+++ b/examples/inference/kaito_workspace_phi_3.5-instruct.yaml
@@ -0,0 +1,12 @@
+apiVersion: kaito.sh/v1alpha1
+kind: Workspace
+metadata:
+  name: workspace-phi-3-5-mini
+resource:
+  instanceType: "Standard_NC12s_v3"
+  labelSelector:
+    matchLabels:
+      apps: phi-3-5
+inference:
+  preset:
+    name: phi-3.5-mini-instruct

From fa2c1554c127917906a79aae301146387c129c82 Mon Sep 17 00:00:00 2001
From: zhuangqh
Date: Thu, 5 Dec 2024 14:48:15 +1100
Subject: [PATCH 2/2] update

---
 README.md | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 60a82b0b1..0a5fe3b50 100644
--- a/README.md
+++ b/README.md
@@ -72,11 +72,30 @@ workspace-phi-3-5-mini Standard_NC12s_v3 True True
 Next, one can find the inference service's cluster ip and use a temporal `curl` pod to test the service endpoint in the cluster.

 ```sh
+# find the service endpoint
 $ kubectl get svc workspace-phi-3-5-mini
 NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
 workspace-phi-3-5-mini ClusterIP 80/TCP,29500/TCP 10m
-
-export CLUSTERIP=$(kubectl get svc workspace-phi-3-5-mini -o jsonpath="{.spec.clusterIPs[0]}")
+$ export CLUSTERIP=$(kubectl get svc workspace-phi-3-5-mini -o jsonpath="{.spec.clusterIPs[0]}")
+
+# find available models
+$ kubectl run -it --rm --restart=Never curl --image=curlimages/curl -- curl -s http://$CLUSTERIP/v1/models | jq
+{
+  "object": "list",
+  "data": [
+    {
+      "id": "phi-3.5-mini-instruct",
+      "object": "model",
+      "created": 1733370094,
+      "owned_by": "vllm",
+      "root": "/workspace/vllm/weights",
+      "parent": null,
+      "max_model_len": 16384
+    }
+  ]
+}
+
+# make an inference call using the model id (phi-3.5-mini-instruct) from the previous step
 $ kubectl run -it --rm --restart=Never curl --image=curlimages/curl -- curl -X POST http://$CLUSTERIP/v1/completions \
   -H "Content-Type: application/json" \
   -d '{