Skip to content

Commit

Permalink
Merge branch 'main' into sdahal/aws_documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
smritidahal653 authored Nov 25, 2024
2 parents aa5eec3 + 511dfa1 commit 42d8f0a
Show file tree
Hide file tree
Showing 47 changed files with 1,349 additions and 80 deletions.
12 changes: 6 additions & 6 deletions .github/e2e-preset-configs.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
{
"name": "falcon-7b",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
Expand All @@ -21,39 +21,39 @@
{
"name": "falcon-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
{
"name": "falcon-40b",
"node-count": 1,
"node-vm-size": "Standard_NC24s_v3",
"node-vm-size": "Standard_NC48ads_A100_v4",
"node-osdisk-size": 400,
"OSS": true,
"loads_adapter": false
},
{
"name": "falcon-40b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC24s_v3",
"node-vm-size": "Standard_NC48ads_A100_v4",
"node-osdisk-size": 400,
"OSS": true,
"loads_adapter": false
},
{
"name": "mistral-7b",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
{
"name": "mistral-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
submodules: true
fetch-depth: 0

- uses: actions/setup-go@v5.0.2
- uses: actions/setup-go@v5.1.0
with:
go-version-file: 'go.mod'

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/create-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
egress-policy: audit

- name: Set up Go ${{ env.GO_VERSION }}
uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
with:
go-version: ${{ env.GO_VERSION }}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/dependency-review.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ jobs:
- name: 'Checkout Repository'
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: 'Dependency Review'
uses: actions/dependency-review-action@5a2ce3f5b92ee19cbb1541a4984c76d921601d7c # v4.3.4
uses: actions/dependency-review-action@3b139cfc5fae8b618d3eae3675e383bb1769c019 # v4.5.0
69 changes: 51 additions & 18 deletions .github/workflows/e2e-preset-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,17 @@ on:
type: boolean
default: false
description: "Test all Phi models for E2E"
test-on-vllm:
type: boolean
default: false
description: "Test on VLLM runtime"

env:
GO_VERSION: "1.22"
BRANCH_NAME: ${{ github.head_ref || github.ref_name}}
FORCE_RUN_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }}
FORCE_RUN_ALL_PHI: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all-phi-models== 'true' }}
RUNTIME: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.test-on-vllm == 'true') && 'vllm' || 'hf' }}

permissions:
id-token: write
Expand Down Expand Up @@ -229,10 +234,11 @@ jobs:
- name: Replace IP and Deploy Resource to K8s
run: |
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}")
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
- name: Wait for Resource to be ready
run: |
Expand All @@ -243,20 +249,27 @@ jobs:
run: |
POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)
- name: Test home endpoint
- name: Install testing commands
run: |
curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get update
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get install -y curl
- name: Test healthz endpoint
run: |
curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz
if [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s http://localhost:5000/healthz
else
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s http://localhost:5000/health
fi
- name: Test inference endpoint
run: |
echo "Testing inference for ${{ matrix.model.name }}"
if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then
echo "Testing inference for ${{ matrix.model.name }}"
curl -X POST \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s -X POST \
-H "Content-Type: application/json" \
-d '{
"input_data": {
Expand All @@ -274,10 +287,10 @@ jobs:
]
}
}' \
http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat
http://localhost:5000/chat
elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
echo "Testing inference for ${{ matrix.model.name }}"
curl -X POST \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s -X POST \
-H "Content-Type: application/json" \
-d '{
"prompts": [
Expand All @@ -290,10 +303,29 @@ jobs:
"max_gen_len": 128
}
}' \
http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate
http://localhost:5000/generate
elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s -X POST \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d '{
"model": "test",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Hello!"
}
]
}' \
http://localhost:5000/v1/chat/completions
else
echo "Testing inference for ${{ matrix.model.name }}"
curl -X POST \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s -X POST \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d '{
Expand Down Expand Up @@ -327,7 +359,7 @@ jobs:
"remove_invalid_values":null
}
}' \
http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat
http://localhost:5000/chat
fi
- name: Cleanup
Expand All @@ -340,6 +372,7 @@ jobs:
# Check and Delete K8s Resource (Deployment or StatefulSet)
if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then
kubectl logs $RESOURCE_TYPE/${{ matrix.model.name }}
kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }}
fi
fi
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/e2e-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ jobs:
run: sudo rm -rf ~/go/pkg/mod

- name: Set up Go ${{ env.GO_VERSION }}
uses: actions/setup-go@v5.0.2
uses: actions/setup-go@v5.1.0
with:
go-version: ${{ env.GO_VERSION }}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/kind-cluster/determine_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def read_yaml(file_path):
YAML_PR = read_yaml(supp_models_yaml)
# Format: {falcon-7b : {model_name:falcon-7b, type:text-generation, version: #, tag: #}}
MODELS = {model['name']: model for model in YAML_PR['models']}
KAITO_REPO_URL = "https://github.com/kaito-repo/kaito.git"
KAITO_REPO_URL = "https://github.com/kaito-project/kaito.git"

def set_multiline_output(name, value):
with open(os.environ['GITHUB_OUTPUT'], 'a') as fh:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lint-go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
fetch-depth: 0

- name: Set up Go ${{ env.GO_VERSION }}
uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
with:
go-version: ${{ env.GO_VERSION }}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/publish-mcr-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
environment: publish-mcr
steps:
- name: Set up Go ${{ env.GO_VERSION }}
uses: actions/setup-go@v5.0.2
uses: actions/setup-go@v5.1.0
with:
go-version: ${{ env.GO_VERSION }}

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
fetch-depth: 0

- name: Set up Go ${{ env.GO_VERSION }}
uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
with:
go-version: ${{ env.GO_VERSION }}
- name: Generate APIs
Expand All @@ -52,7 +52,7 @@ jobs:
DEVICE=cpu make inference-api-e2e
- name: Upload Codecov report
uses: codecov/codecov-action@b9fd7d16f6d7d1b5d2bec1a2887e65ceed900238 # v4.6.0
uses: codecov/codecov-action@015f24e6818733317a2da2edd6290ab26238649a # v5.0.7
with:
## Comma-separated list of files to upload
files: ./coverage.txt
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Kaito follows the classic Kubernetes Custom Resource Definition(CRD)/controller
The above figure presents the Kaito architecture overview. Its major components consist of:

- **Workspace controller**: It reconciles the `workspace` custom resource, creates `machine` (explained below) custom resources to trigger node auto provisioning, and creates the inference or tuning workload (`deployment`, `statefulset` or `job`) based on the model preset configurations.
- **Node provisioner controller**: The controller's name is *gpu-provisioner* in [gpu-provisioner helm chart](https://github.com/Azure/gpu-provisioner/tree/main/charts/gpu-provisioner). It uses the `machine` CRD originated from [Karpenter](https://sigs.k8s.io/karpenter) to interact with the workspace controller. It integrates with Azure Resource Manager REST APIs to add new GPU nodes to the AKS cluster.
- **Node provisioner controller**: The controller's name is *gpu-provisioner* in [gpu-provisioner helm chart](https://github.com/Azure/gpu-provisioner/tree/main/charts/gpu-provisioner). It uses the `machine` CRD originated from [Karpenter](https://sigs.k8s.io/karpenter) to interact with the workspace controller. It integrates with Azure Resource Manager REST APIs to add new GPU nodes to the AKS or AKS Arc cluster.
> Note: The [*gpu-provisioner*](https://github.com/Azure/gpu-provisioner) is an open sourced component. It can be replaced by other controllers if they support [Karpenter-core](https://sigs.k8s.io/karpenter) APIs.
## Installation
Expand Down
2 changes: 1 addition & 1 deletion docs/custom-model-integration/Dockerfile.reference
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.10-slim@sha256:684b1aaf96a7942b3c3af438d162e0baa3510aa7af25ad76d238e0c746bdec79
FROM python:3.12-slim

# Specify the repository source URL for reference and access in Kaito packages.
LABEL org.opencontainers.image.source=https://github.com/kaito-project/kaito
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Custom Model Integration Guide

## Option 1: Use Pre-Built Docker Image Without Model Weights
If you want to avoid building a Docker image with model weights, use our pre-built reference image (`ghcr.io/azure/kaito/llm-reference-preset:latest`). This image, built with [Dockerfile.reference](./Dockerfile.reference), dynamically downloads model weights from HuggingFace at runtime, reducing the need to create and maintain custom images.
If you want to avoid building a Docker image with model weights, use our pre-built reference image (`ghcr.io/kaito-project/kaito/llm-reference-preset:latest`). This image, built with [Dockerfile.reference](./Dockerfile.reference), dynamically downloads model weights from HuggingFace at runtime, reducing the need to create and maintain custom images.


- **[Sample Deployment YAML](./reference-image-deployment.yaml)**
Expand Down
3 changes: 2 additions & 1 deletion docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ You can run the following commands to verify the installation of the controllers
Check status of the Helm chart installations.

```bash
helm list -n default
helm list -n kaito-workspace
helm list -n gpu-provisioner
```

Check status of the `workspace`.
Expand Down
Loading

0 comments on commit 42d8f0a

Please sign in to comment.