Skip to content

Commit

Permalink
Add e2e test for nodeclaim
Browse files Browse the repository at this point in the history
Signed-off-by: Heba Elayoty <[email protected]>
  • Loading branch information
helayoty committed May 11, 2024
1 parent 0ac5bf3 commit 60e4b8c
Show file tree
Hide file tree
Showing 13 changed files with 432 additions and 193 deletions.
82 changes: 31 additions & 51 deletions .github/workflows/e2e-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ jobs:
environment: e2e-test
env:
GO_VERSION: "1.22"

KARPENTER_NAMESPACE: "karpenter"
GPU_PROVISIONER_NAMESPACE: "gpu-provisioner"
steps:
- name: Harden Runner
uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
Expand Down Expand Up @@ -110,21 +111,6 @@ jobs:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }}

- name: Create Azure Identity
uses: azure/[email protected]
with:
inlineScript: |
az identity create --name ${{ inputs.suite }}Identity --resource-group ${{ env.CLUSTER_NAME }}
- name: build KAITO image
if: ${{ !inputs.isRelease }}
shell: bash
run: |
make docker-build-kaito
env:
REGISTRY: ${{ env.REGISTRY }}
VERSION: ${{ env.VERSION }}

- name: create cluster
shell: bash
run: |
Expand All @@ -136,12 +122,20 @@ jobs:
AZURE_LOCATION: ${{ inputs.region }}
AKS_K8S_VERSION: ${{ inputs.k8s_version }}

- name: Create Identities and Permissions for ${{ inputs.suite }}
shell: bash
run: |
make generate-identities
env:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
TEST_SUITE: ${{ inputs.suite }}

- name: Install gpu-provisioner helm chart
if: ${{ inputs.suite == 'gpuprov' }}
if: ${{ inputs.suite == 'gpuprovisioner' }}
shell: bash
run: |
make gpu-provisioner-helm
kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s
env:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
Expand All @@ -154,45 +148,22 @@ jobs:
shell: bash
run: |
make azure-karpenter-helm
# taint nodes as karpenter-system
kubectl taint nodes CriticalAddonsOnly=true:NoSchedule --all
kubectl wait --for=condition=available deploy "karpenter" -n karpenter --timeout=300s
env:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
AZURE_TENANT_ID: ${{ secrets.E2E_TENANT_ID }}
AZURE_SUBSCRIPTION_ID: ${{ secrets.E2E_SUBSCRIPTION_ID }}
KARPENTER_VERSION: ${{ vars.KARPENTER_VERSION }}
KARPENTER_NAMESPACE: ${{ env.KARPENTER_NAMESPACE }}

- uses: azure/login@8c334a195cbb38e46038007b304988d888bf676a # v2.0.0
with:
client-id: ${{ secrets.E2E_CLIENT_ID }}
tenant-id: ${{ secrets.E2E_TENANT_ID }}
subscription-id: ${{ secrets.E2E_SUBSCRIPTION_ID }}

- name: Create Role Assignment
uses: azure/[email protected]
with:
inlineScript: |
IDENTITY_PRINCIPAL_ID="$(az identity show --name ${{ inputs.suite }}Identity --resource-group ${{ env.CLUSTER_NAME }} --query 'principalId' -otsv)"
az role assignment create --assignee ${IDENTITY_PRINCIPAL_ID} --scope "/subscriptions/${{ secrets.E2E_SUBSCRIPTION_ID }}/resourceGroups/${{ env.CLUSTER_NAME }}" --role "Contributor"
if [ "${{ inputs.suite }}" == "azkarpenter" ]; then
fi
- name: Create Azure Federated Identity
uses: azure/[email protected]
with:
inlineScript: |
AKS_OIDC_ISSUER="$(az aks show -n "${{ env.CLUSTER_NAME }}" -g "${{ env.CLUSTER_NAME }}" --query 'oidcIssuerProfile.issuerUrl' -otsv)"
if [ "${{ inputs.suite }}" == "gpuprov" ]; then
az identity federated-credential create --name ${{ inputs.suite }}-fed --identity-name ${{ inputs.suite }}Identity --resource-group "${{ env.CLUSTER_NAME }}" \
--issuer "${AKS_OIDC_ISSUER}" --subject system:serviceaccount:"gpu-provisioner:gpu-provisioner" --audience api://AzureADTokenExchange
fi
if [ "${{ inputs.suite }}" == "azkarpenter" ]; then
az identity federated-credential create --name ${{ inputs.suite }}-fed --identity-name ${{ inputs.suite }}Identity --resource-group "${{ env.CLUSTER_NAME }}" \
--issuer "${AKS_OIDC_ISSUER}" --subject system:serviceaccount:"karpenter:karpenter-sa" --audience api://AzureADTokenExchange
fi
- name: build KAITO image
if: ${{ !inputs.isRelease }}
shell: bash
run: |
make docker-build-kaito
env:
REGISTRY: ${{ env.REGISTRY }}
VERSION: ${{ env.VERSION }}

- name: Install KAITO Workspace helm chart
shell: bash
Expand All @@ -211,7 +182,15 @@ jobs:
--docker-server=${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io \
--docker-username=${{ secrets.E2E_ACR_AMRT_USERNAME }} \
--docker-password=${{ secrets.E2E_ACR_AMRT_PASSWORD }}
- name: Log ${{ inputs.suite }}
run: |
if [ "${{ inputs.suite }}" == "gpuprovisioner" ]; then
kubectl logs -f -n "${{ env.GPU_PROVISIONER_NAMESPACE }}" -l app.kubernetes.io/name=gpu-provisioner -c controller
else
kubectl logs -f -n "${{ env.KARPENTER_NAMESPACE }}" -l app.kubernetes.io/name=karpenter -c controller
fi
- name: Log kaito-workspace
run: |
kubectl get pods -n kaito-workspace -o name | grep "^pod/kaito-workspace" | sed 's/^pod\///' | xargs -I {} kubectl logs -n kaito-workspace {}
Expand All @@ -224,6 +203,7 @@ jobs:
RUN_LLAMA_13B: ${{ env.RUN_LLAMA_13B }}
AI_MODELS_REGISTRY: ${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io
AI_MODELS_REGISTRY_SECRET: ${{ secrets.E2E_AMRT_SECRET_NAME }}
TEST_SUITE: ${{ inputs.suite }}

- name: Cleanup e2e resources
if: ${{ always() }}
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/kaito-e2e.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
name: pr-e2e-test

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

on:
pull_request:
paths-ignore: ['docs/**', '**.md', '**.mdx', '**.png', '**.jpg']
Expand All @@ -16,7 +20,7 @@ jobs:
strategy:
fail-fast: false
matrix:
suite: [ gpuprov, azkarpenter ]
suite: [ gpuprovisioner, azkarpenter ]
permissions:
contents: read
id-token: write
Expand Down
74 changes: 21 additions & 53 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@ GOLANGCI_LINT_BIN := golangci-lint
GOLANGCI_LINT := $(abspath $(TOOLS_BIN_DIR)/$(GOLANGCI_LINT_BIN)-$(GOLANGCI_LINT_VER))

E2E_TEST_BIN := e2e.test
KARPENTER_E2E_TEST_BIN := karpenter-e2e.test
E2E_TEST := $(BIN_DIR)/$(E2E_TEST_BIN)
KARPENTER_E2E_TEST := $(BIN_DIR)/$(KARPENTER_E2E_TEST_BIN)

GINKGO_VER := v2.17.1
GINKGO_BIN := ginkgo
GINKGO := $(TOOLS_BIN_DIR)/$(GINKGO_BIN)-$(GINKGO_VER)
TEST_SUITE ?= gpuprovisioner

AZURE_SUBSCRIPTION_ID ?= $(AZURE_SUBSCRIPTION_ID)
AZURE_LOCATION ?= eastus
Expand All @@ -33,14 +32,13 @@ AZURE_CLUSTER_NAME ?= kaito-demo
AZURE_RESOURCE_GROUP_MC=MC_$(AZURE_RESOURCE_GROUP)_$(AZURE_CLUSTER_NAME)_$(AZURE_LOCATION)
GPU_PROVISIONER_NAMESPACE ?= gpu-provisioner
KAITO_NAMESPACE ?= kaito-workspace
GPU_PROVISIONER_MSI_NAME ?= gpuIdentity
GPU_PROVISIONER_MSI_NAME ?= gpuprovisionerIdentity

## Karpenter parameters
## Azure Karpenter parameters
KARPENTER_NAMESPACE ?= karpenter
KARPENTER_SERVICE_ACCOUNT_NAME ?= karpenter-sa
KARPENTER_SA_NAME ?= karpenter-sa
KARPENTER_VERSION ?= 0.4.0
AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME ?= karpenterIdentity
KARPENTER_FEDERATED_IDENTITY_CREDENTIAL_NAME ?= karpenter-fed
AZURE_KARPENTER_MSI_NAME ?= azkarpenterIdentity

RUN_LLAMA_13B ?= false
AI_MODELS_REGISTRY ?= modelregistry.azurecr.io
Expand Down Expand Up @@ -98,19 +96,11 @@ $(E2E_TEST):
.PHONY: kaito-workspace-e2e-test
kaito-workspace-e2e-test: $(E2E_TEST) $(GINKGO)
AI_MODELS_REGISTRY_SECRET=$(AI_MODELS_REGISTRY_SECRET) RUN_LLAMA_13B=$(RUN_LLAMA_13B) \
AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) GPU_NAMESPACE=$(GPU_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) \
AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) GPU_PROVISIONER_NAMESPACE=$(GPU_PROVISIONER_NAMESPACE) \
KARPENTER_NAMESPACE=$(KARPENTER_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) TEST_SUITE=$(TEST_SUITE) \
SUPPORTED_MODELS_YAML_PATH=$(SUPPORTED_MODELS_YAML_PATH) \
$(GINKGO) -v -trace $(GINKGO_ARGS) $(E2E_TEST)

$(KARPENTER_E2E_TEST):
(cd test/e2e/karpenter && go test -c . -o $(KARPENTER_E2E_TEST))

.PHONY: kaito-karpenter-e2e-test
kaito-karpenter-e2e-test: $(E2E_TEST) $(GINKGO)
AI_MODELS_REGISTRY_SECRET=$(AI_MODELS_REGISTRY_SECRET) RUN_LLAMA_13B=$(RUN_LLAMA_13B) \
AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) KARPENTER=$(KARPENTER_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) \
$(GINKGO) -v -trace $(GINKGO_ARGS) $(KARPENTER_E2E_TEST)

## --------------------------------------
## Azure resources
## --------------------------------------
Expand Down Expand Up @@ -149,11 +139,9 @@ create-aks-cluster-for-karpenter: ## Create test AKS cluster (with msi, cilium,
--enable-managed-identity --enable-oidc-issuer --enable-workload-identity -o none
az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --overwrite-existing


## --------------------------------------
## Image Docker Build
## --------------------------------------

BUILDX_BUILDER_NAME ?= img-builder
OUTPUT_TYPE ?= type=registry
QEMU_VERSION ?= 5.2.0-2
Expand Down Expand Up @@ -198,51 +186,33 @@ az-patch-install-helm: ## Update Azure client env vars and settings in helm valu

helm install kaito-workspace ./charts/kaito/workspace --namespace $(KAITO_NAMESPACE) --create-namespace

generate-identities: ## Create identities for the provisioner component.
./hack/deploy/generate-identities.sh \
$(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) $(TEST_SUITE)

## --------------------------------------
## gpu-provisioner installation
## --------------------------------------
gpu-provisioner-identity-perm: ## Create identity for gpu-provisioner
az identity create --name $(GPU_PROVISIONER_MSI_NAME) --resource-group $(AZURE_RESOURCE_GROUP)

IDENTITY_PRINCIPAL_ID=$(shell az identity show --name $(GPU_PROVISIONER_MSI_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --subscription $(AZURE_SUBSCRIPTION_ID) --query 'principalId');\
az role assignment create --assignee $$IDENTITY_PRINCIPAL_ID --scope /subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP) --role "Contributor"

AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl");\
az identity federated-credential create --name gpu-federatecredential --identity-name $(GPU_PROVISIONER_MSI_NAME) --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \
--subject system:serviceaccount:"$(GPU_PROVISIONER_NAMESPACE):$(GPU_PROVISIONER_NAMESPACE)" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID)

.PHONY: gpu-provisioner-helm
gpu-provisioner-helm: ## Update Azure client env vars and settings in helm values.yml
curl -sO https://raw.githubusercontent.com/Azure/gpu-provisioner/main/hack/deploy/configure-helm-values.sh
chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) $(GPU_PROVISIONER_MSI_NAME)
chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $(AZURE_CLUSTER_NAME) \
$(AZURE_RESOURCE_GROUP) $(GPU_PROVISIONER_MSI_NAME)

helm install $(GPU_PROVISIONER_NAMESPACE) --values gpu-provisioner-values.yaml --set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) --wait \
helm install gpu-provisioner \
--values gpu-provisioner-values.yaml \
--set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) \
https://github.com/Azure/gpu-provisioner/raw/gh-pages/charts/gpu-provisioner-$(GPU_PROVISIONER_VERSION).tgz

kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s
## --------------------------------------
## Azure Karpenter Installation
## --------------------------------------
karpenter-identity-perm:
az identity create --name $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME) --resource-group $(AZURE_RESOURCE_GROUP)

KARPENTER_USER_ASSIGNED_PRINCIPAL_ID=$(shell az identity show -n "$(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME)" \
-g "$(AZURE_RESOURCE_GROUP)" --query 'principalId');\
az role assignment create --assignee $$KARPENTER_USER_ASSIGNED_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP)" --role "Virtual Machine Contributor";\
az role assignment create --assignee $$KARPENTER_USER_ASSIGNED_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP)" --role "Network Contributor";\
az role assignment create --assignee $$KARPENTER_USER_ASSIGNED_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP)" --role "Managed Identity Operator"

AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl");\
az identity federated-credential create --name $(KARPENTER_FEDERATED_IDENTITY_CREDENTIAL_NAME) \
--identity-name $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME) \
--resource-group "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \
--subject system:serviceaccount:"$(KARPENTER_NAMESPACE):$(KARPENTER_SERVICE_ACCOUNT_NAME)" \
--audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID)

.PHONY: azure-karpenter-helm
azure-karpenter-helm: ## Update Azure client env vars and settings in helm values.yml
curl -sO https://raw.githubusercontent.com/Azure/karpenter-provider-azure/main/hack/deploy/configure-values.sh
chmod +x ./configure-values.sh && ./configure-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) \
$(KARPENTER_SERVICE_ACCOUNT_NAME) $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME)
chmod +x ./configure-values.sh && ./configure-values.sh $(AZURE_CLUSTER_NAME) \
$(AZURE_RESOURCE_GROUP) $(KARPENTER_SA_NAME) $(AZURE_KARPENTER_MSI_NAME)

helm upgrade --install karpenter oci://mcr.microsoft.com/aks/karpenter/karpenter \
--version "$(KARPENTER_VERSION)" \
Expand All @@ -251,10 +221,9 @@ azure-karpenter-helm: ## Update Azure client env vars and settings in helm valu
--set controller.resources.requests.cpu=1 \
--set controller.resources.requests.memory=1Gi \
--set controller.resources.limits.cpu=1 \
--set controller.resources.limits.memory=1Gi \
--wait
--set controller.resources.limits.memory=1Gi

kubectl logs -f -n "$(KARPENTER_NAMESPACE)" -l app.kubernetes.io/name=karpenter -c controller
kubectl wait --for=condition=available deploy "karpenter" -n karpenter --timeout=300s

##@ Development
.PHONY: manifests
Expand All @@ -265,7 +234,6 @@ manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and Cust
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."


##@ Build
.PHONY: build
build: manifests generate fmt vet ## Build manager binary.
Expand Down
2 changes: 1 addition & 1 deletion charts/kaito/workspace/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Declare variables to be passed into your templates.
replicaCount: 1
image:
repository: mcr.microsoft.com/aks/kaito/workspace
repository: YOUR_REGISTRY/workspace
pullPolicy: IfNotPresent
tag: 0.2.2
imagePullSecrets: []
Expand Down
34 changes: 34 additions & 0 deletions configure-helm-values.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
set -euo pipefail

# Reference script with the same structure (NOTE(review): this file appears to
# be adapted from it -- confirm):
# https://github.com/Azure/karpenter-provider-azure/blob/2beb773cbd3134eeabb8c96b72a130b86b1a91e1/hack/deploy/configure-values.sh

# This script interrogates the AKS cluster and Azure resources to generate
# the gpu-provisioner-values.yaml file using the gpu-provisioner-values-template.yaml file as a template.
#
# Arguments:
#   $1  name of the AKS cluster
#   $2  resource group that holds the cluster and the managed identity
#   $3  name of the user-assigned managed identity used by gpu-provisioner
#
# Output:
#   gpu-provisioner-values.yaml, written to the current working directory.
#
# Requires az, jq and yq on PATH; curl is needed only when the template file
# is not already present in the current working directory.

if [ "$#" -ne 3 ]; then
  # Usage errors belong on stderr so they are not mistaken for normal output.
  echo "Usage: $0 <cluster-name> <resource-group> <gpu-provisioner-user-assigned-identity-name>" >&2
  exit 1
fi

CLUSTER_NAME=$1
AZURE_RESOURCE_GROUP=$2
AZURE_GPU_PROVISIONER_USER_ASSIGNED_IDENTITY_NAME=$3

TEMPLATE_FILE=gpu-provisioner-values-template.yaml
TEMPLATE_URL=https://raw.githubusercontent.com/Azure/gpu-provisioner/main/gpu-provisioner-values-template.yaml
VALUES_FILE=gpu-provisioner-values.yaml

# Fail early with a clear message instead of a "command not found" halfway
# through, after some Azure calls have already been made.
for tool in az jq yq; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Error: required tool '$tool' was not found on PATH" >&2
    exit 1
  fi
done

echo "Configuring gpu-provisioner-values.yaml for cluster $CLUSTER_NAME in resource group $AZURE_RESOURCE_GROUP ..."

# Query the cluster once and extract every needed field from the cached JSON.
AKS_JSON=$(az aks show --name "$CLUSTER_NAME" --resource-group "$AZURE_RESOURCE_GROUP")
AZURE_LOCATION=$(jq -r ".location" <<< "$AKS_JSON")
AZURE_RESOURCE_GROUP_MC=$(jq -r ".nodeResourceGroup" <<< "$AKS_JSON")
AZURE_TENANT_ID=$(az account show | jq -r ".tenantId")

GPU_PROVISIONER_USER_ASSIGNED_CLIENT_ID=$(az identity show \
  --resource-group "${AZURE_RESOURCE_GROUP}" \
  --name "${AZURE_GPU_PROVISIONER_USER_ASSIGNED_IDENTITY_NAME}" \
  --query 'clientId' -otsv)

# yq's envsubst reads from the process environment, so the values must be exported.
export CLUSTER_NAME AZURE_LOCATION AZURE_RESOURCE_GROUP_MC GPU_PROVISIONER_USER_ASSIGNED_CLIENT_ID AZURE_TENANT_ID

# get gpu-provisioner-values-template.yaml, if not already present (e.g. outside of repo context)
if [ ! -f "$TEMPLATE_FILE" ]; then
  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # error page as the template. The download goes to a scratch name first so
  # that a failed transfer never leaves a file that passes the -f check above
  # on the next run.
  curl -fsSL -o "${TEMPLATE_FILE}.download" "$TEMPLATE_URL"
  mv "${TEMPLATE_FILE}.download" "$TEMPLATE_FILE"
fi

# Substitute the exported variables into every string scalar of the template.
# The "nu" option makes yq fail if a referenced variable is unset.
yq '(.. | select(tag == "!!str")) |= envsubst(nu)' "$TEMPLATE_FILE" > "$VALUES_FILE"
Loading

0 comments on commit 60e4b8c

Please sign in to comment.