Merge branch 'main' into sdahal/aws_documentation

kaito-project · Nov 25, 2024 · 42d8f0a
2 parents aa5eec3 + 511dfa1
commit 42d8f0a
Show file tree

Hide file tree

Showing 47 changed files with 1,349 additions and 80 deletions.
diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json
@@ -4,7 +4,7 @@
       {
         "name": "falcon-7b",
         "node-count": 1,
-        "node-vm-size": "Standard_NC12s_v3",
+        "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
         "loads_adapter": false
@@ -21,39 +21,39 @@
       {
         "name": "falcon-7b-instruct",
         "node-count": 1,
-        "node-vm-size": "Standard_NC12s_v3",
+        "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
         "loads_adapter": false
       },
       {
         "name": "falcon-40b",
         "node-count": 1,
-        "node-vm-size": "Standard_NC24s_v3",
+        "node-vm-size": "Standard_NC48ads_A100_v4",
         "node-osdisk-size": 400,
         "OSS": true,
         "loads_adapter": false
       },
       {
         "name": "falcon-40b-instruct",
         "node-count": 1,
-        "node-vm-size": "Standard_NC24s_v3",
+        "node-vm-size": "Standard_NC48ads_A100_v4",
         "node-osdisk-size": 400,
         "OSS": true,
         "loads_adapter": false
       },
       {
         "name": "mistral-7b",
         "node-count": 1,
-        "node-vm-size": "Standard_NC12s_v3",
+        "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
         "loads_adapter": false
       },
       {
         "name": "mistral-7b-instruct",
         "node-count": 1,
-        "node-vm-size": "Standard_NC12s_v3",
+        "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
         "loads_adapter": false

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -32,7 +32,7 @@ jobs:
           submodules: true
           fetch-depth: 0
 
-      - uses: actions/setup-go@v5.0.2
+      - uses: actions/setup-go@v5.1.0
         with:
           go-version-file: 'go.mod'
 

diff --git a/.github/workflows/create-release.yml b/.github/workflows/create-release.yml
@@ -21,7 +21,7 @@ jobs:
           egress-policy: audit
 
       - name: Set up Go ${{ env.GO_VERSION }}
-        uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
+        uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
         with:
           go-version: ${{ env.GO_VERSION  }}
 

diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
@@ -24,4 +24,4 @@ jobs:
       - name: 'Checkout Repository'
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: 'Dependency Review'
-        uses: actions/dependency-review-action@5a2ce3f5b92ee19cbb1541a4984c76d921601d7c # v4.3.4
+        uses: actions/dependency-review-action@3b139cfc5fae8b618d3eae3675e383bb1769c019 # v4.5.0
diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
@@ -15,12 +15,17 @@ on:
                 type: boolean
                 default: false
                 description: "Test all Phi models for E2E"
+            test-on-vllm:
+                type: boolean
+                default: false
+                description: "Test on VLLM runtime"
 
 env:
     GO_VERSION: "1.22"
     BRANCH_NAME: ${{ github.head_ref || github.ref_name}} 
     FORCE_RUN_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }}
     FORCE_RUN_ALL_PHI:  ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all-phi-models== 'true' }}
+    RUNTIME: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.test-on-vllm == 'true') && 'vllm' || 'hf' }}
 
 permissions:
     id-token: write
@@ -229,10 +234,11 @@ jobs:
       
       - name: Replace IP and Deploy Resource to K8s
         run: |
-            sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
-            sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
-            sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
-            kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
+            POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}")
+            sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
+            sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
+            sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
+            kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
 
       - name: Wait for Resource to be ready
         run: |
@@ -243,20 +249,27 @@ jobs:
         run: |
             POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
             kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)
-          
-      - name: Test home endpoint
+
+      - name: Install testing commands
         run: |
-            curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get update
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get install -y curl
 
       - name: Test healthz endpoint
         run: |
-            curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz
-    
+            if [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -s http://localhost:5000/healthz
+            else
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -s http://localhost:5000/health
+            fi
       - name: Test inference endpoint
         run: |
+            echo "Testing inference for ${{ matrix.model.name }}"
             if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then
-                echo "Testing inference for ${{ matrix.model.name }}"
-                curl -X POST \
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -s -X POST \
                 -H "Content-Type: application/json" \
                 -d '{
                     "input_data": {
@@ -274,10 +287,10 @@ jobs:
                         ]
                     }
                 }' \
-                http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat
+                http://localhost:5000/chat
             elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
-                echo "Testing inference for ${{ matrix.model.name }}"
-                curl -X POST \
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -s -X POST \
                 -H "Content-Type: application/json" \
                 -d '{
                     "prompts": [
@@ -290,10 +303,29 @@ jobs:
                         "max_gen_len": 128
                     }
                 }' \
-                http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate
+                http://localhost:5000/generate
+            elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -s -X POST \
+                -H "accept: application/json" \
+                -H "Content-Type: application/json" \
+                -d '{
+                    "model": "test",
+                    "messages": [
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant."
+                    },
+                    {
+                        "role": "user",
+                        "content": "Hello!"
+                    }
+                    ]
+                    }' \
+                http://localhost:5000/v1/chat/completions
             else
-                echo "Testing inference for ${{ matrix.model.name }}"
-                curl -X POST \
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -s -X POST \
                 -H "accept: application/json" \
                 -H "Content-Type: application/json" \
                 -d '{
@@ -327,7 +359,7 @@ jobs:
                             "remove_invalid_values":null
                         }
                     }' \
-                http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat                
+                http://localhost:5000/chat
             fi
       
       - name: Cleanup
@@ -340,6 +372,7 @@ jobs:
                 
                 # Check and Delete K8s Resource (Deployment or StatefulSet)
                 if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then
+                    kubectl logs $RESOURCE_TYPE/${{ matrix.model.name }}
                     kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }}
                 fi
             fi

diff --git a/.github/workflows/e2e-workflow.yml b/.github/workflows/e2e-workflow.yml
@@ -72,7 +72,7 @@ jobs:
         run: sudo rm -rf ~/go/pkg/mod
 
       - name: Set up Go ${{ env.GO_VERSION }}
-        uses: actions/setup-go@v5.0.2
+        uses: actions/setup-go@v5.1.0
         with:
           go-version: ${{ env.GO_VERSION  }}
 

diff --git a/.github/workflows/kind-cluster/determine_models.py b/.github/workflows/kind-cluster/determine_models.py
@@ -21,7 +21,7 @@ def read_yaml(file_path):
 YAML_PR = read_yaml(supp_models_yaml)
 # Format: {falcon-7b : {model_name:falcon-7b, type:text-generation, version: #, tag: #}}
 MODELS = {model['name']: model for model in YAML_PR['models']}
-KAITO_REPO_URL = "https://github.com/kaito-repo/kaito.git"
+KAITO_REPO_URL = "https://github.com/kaito-project/kaito.git"
 
 def set_multiline_output(name, value):
     with open(os.environ['GITHUB_OUTPUT'], 'a') as fh:

diff --git a/.github/workflows/lint-go.yml b/.github/workflows/lint-go.yml
@@ -35,7 +35,7 @@ jobs:
           fetch-depth: 0
 
       - name: Set up Go ${{ env.GO_VERSION }}
-        uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
+        uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
         with:
           go-version: ${{ env.GO_VERSION }}
 

diff --git a/.github/workflows/publish-mcr-image.yml b/.github/workflows/publish-mcr-image.yml
@@ -18,7 +18,7 @@ jobs:
     environment: publish-mcr
     steps:
       - name: Set up Go ${{ env.GO_VERSION }}
-        uses: actions/setup-go@v5.0.2
+        uses: actions/setup-go@v5.1.0
         with:
           go-version: ${{ env.GO_VERSION  }}
 

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -36,7 +36,7 @@ jobs:
           fetch-depth: 0
 
       - name: Set up Go ${{ env.GO_VERSION }}
-        uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
+        uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
         with:
           go-version: ${{ env.GO_VERSION  }}
       - name: Generate APIs
@@ -52,7 +52,7 @@ jobs:
           DEVICE=cpu make inference-api-e2e
 
       - name: Upload Codecov report
-        uses: codecov/codecov-action@b9fd7d16f6d7d1b5d2bec1a2887e65ceed900238 # v4.6.0
+        uses: codecov/codecov-action@015f24e6818733317a2da2edd6290ab26238649a # v5.0.7
         with:
           ## Comma-separated list of files to upload
           files: ./coverage.txt

diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ Kaito follows the classic Kubernetes Custom Resource Definition(CRD)/controller
 The above figure presents the Kaito architecture overview. Its major components consist of:
 
 - **Workspace controller**: It reconciles the `workspace` custom resource, creates `machine` (explained below) custom resources to trigger node auto provisioning, and creates the inference or tuning workload (`deployment`, `statefulset` or `job`) based on the model preset configurations.
-- **Node provisioner controller**: The controller's name is *gpu-provisioner* in [gpu-provisioner helm chart](https://github.com/Azure/gpu-provisioner/tree/main/charts/gpu-provisioner). It uses the `machine` CRD originated from [Karpenter](https://sigs.k8s.io/karpenter) to interact with the workspace controller. It integrates with Azure Resource Manager REST APIs to add new GPU nodes to the AKS cluster.
+- **Node provisioner controller**: The controller's name is *gpu-provisioner* in [gpu-provisioner helm chart](https://github.com/Azure/gpu-provisioner/tree/main/charts/gpu-provisioner). It uses the `machine` CRD originated from [Karpenter](https://sigs.k8s.io/karpenter) to interact with the workspace controller. It integrates with Azure Resource Manager REST APIs to add new GPU nodes to the AKS or AKS Arc cluster.
 > Note: The [*gpu-provisioner*](https://github.com/Azure/gpu-provisioner) is an open sourced component. It can be replaced by other controllers if they support [Karpenter-core](https://sigs.k8s.io/karpenter) APIs.
 
 ## Installation

diff --git a/docs/custom-model-integration/Dockerfile.reference b/docs/custom-model-integration/Dockerfile.reference
@@ -1,4 +1,4 @@
-FROM python:3.10-slim@sha256:684b1aaf96a7942b3c3af438d162e0baa3510aa7af25ad76d238e0c746bdec79
+FROM python:3.12-slim
 
 # Specify the repository source URL for reference and access in Kaito packages.
 LABEL org.opencontainers.image.source=https://github.com/kaito-project/kaito

diff --git a/docs/custom-model-integration/custom-model-integration-guide.md b/docs/custom-model-integration/custom-model-integration-guide.md
@@ -1,7 +1,7 @@
 # Custom Model Integration Guide
 
 ## Option 1: Use Pre-Built Docker Image Without Model Weights
-If you want to avoid building a Docker image with model weights, use our pre-built reference image (`ghcr.io/azure/kaito/llm-reference-preset:latest`). This image, built with [Dockerfile.reference](./Dockerfile.reference), dynamically downloads model weights from HuggingFace at runtime, reducing the need to create and maintain custom images.
+If you want to avoid building a Docker image with model weights, use our pre-built reference image (`ghcr.io/kaito-project/kaito/llm-reference-preset:latest`). This image, built with [Dockerfile.reference](./Dockerfile.reference), dynamically downloads model weights from HuggingFace at runtime, reducing the need to create and maintain custom images.
 
 
 - **[Sample Deployment YAML](./reference-image-deployment.yaml)**

diff --git a/docs/installation.md b/docs/installation.md
@@ -105,7 +105,8 @@ You can run the following commands to verify the installation of the controllers
 Check status of the Helm chart installations.
 
 ```bash
-helm list -n default
+helm list -n kaito-workspace
+helm list -n gpu-provisioner
 ```
 
 Check status of the `workspace`.