From 3f79420630e9fde0a3a8fc349c9c87b62e092fc4 Mon Sep 17 00:00:00 2001 From: Heba Elayoty Date: Wed, 8 May 2024 15:03:58 -0700 Subject: [PATCH] Add karpenter workflow Signed-off-by: Heba Elayoty --- .github/workflows/e2e-workflow.yml | 4 + .github/workflows/kaito-e2e.yml | 18 +- .github/workflows/karpenter-e2e-workflow.yml | 215 ++++++++++++++++++ Makefile | 219 ++++++++++++------- 4 files changed, 377 insertions(+), 79 deletions(-) create mode 100644 .github/workflows/karpenter-e2e-workflow.yml diff --git a/.github/workflows/e2e-workflow.yml b/.github/workflows/e2e-workflow.yml index 3e5465b94..a49f10efa 100644 --- a/.github/workflows/e2e-workflow.yml +++ b/.github/workflows/e2e-workflow.yml @@ -1,5 +1,9 @@ name: kaito-e2e-workflow +concurrency: + group: kaito-e2e-workflow-${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + on: workflow_call: inputs: diff --git a/.github/workflows/kaito-e2e.yml b/.github/workflows/kaito-e2e.yml index 1c1ba28d8..36cea30e4 100644 --- a/.github/workflows/kaito-e2e.yml +++ b/.github/workflows/kaito-e2e.yml @@ -1,9 +1,5 @@ name: pr-e2e-test -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - on: pull_request: paths-ignore: ['docs/**', '**.md', '**.mdx', '**.png', '**.jpg'] @@ -16,7 +12,7 @@ permissions: contents: read # This is required for actions/checkout jobs: - run-e2e: + run-kaito-gpu-provisioner-e2e: uses: ./.github/workflows/e2e-workflow.yml with: git_sha: ${{ github.event.pull_request.head.sha }} @@ -27,3 +23,15 @@ jobs: E2E_AMRT_SECRET_NAME: ${{ secrets.AMRT_SECRET_NAME }} E2E_ACR_AMRT_USERNAME: ${{ secrets.ACR_AMRT_USERNAME }} E2E_ACR_AMRT_PASSWORD: ${{ secrets.ACR_AMRT_PASSWORD }} + + run-kaito-karpenter-e2e: + uses: ./.github/workflows/karpenter-e2e-workflow.yml + with: + git_sha: ${{ github.event.pull_request.head.sha }} + secrets: + E2E_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + E2E_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + 
E2E_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + E2E_AMRT_SECRET_NAME: ${{ secrets.AMRT_SECRET_NAME }} + E2E_ACR_AMRT_USERNAME: ${{ secrets.ACR_AMRT_USERNAME }} + E2E_ACR_AMRT_PASSWORD: ${{ secrets.ACR_AMRT_PASSWORD }} diff --git a/.github/workflows/karpenter-e2e-workflow.yml b/.github/workflows/karpenter-e2e-workflow.yml new file mode 100644 index 000000000..8aa008c7b --- /dev/null +++ b/.github/workflows/karpenter-e2e-workflow.yml @@ -0,0 +1,215 @@ +name: kaito-karpenter-e2e-workflow + +concurrency: + group: kaito-karpenter-e2e-workflow-${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +on: + workflow_call: + inputs: + git_sha: + type: string + required: true + tag: + type: string + isRelease: + type: boolean + default: false + registry: + type: string + region: + type: string + description: "the azure location to run the e2e test in" + default: "eastus" + k8s_version: + type: string + default: "1.29.2" + secrets: + E2E_CLIENT_ID: + required: true + E2E_TENANT_ID: + required: true + E2E_SUBSCRIPTION_ID: + required: true + E2E_AMRT_SECRET_NAME: + required: true + E2E_ACR_AMRT_USERNAME: + required: true + E2E_ACR_AMRT_PASSWORD: + required: true + +permissions: + contents: read # This is required for actions/checkout + +jobs: + azure-e2e-tests: + runs-on: ubuntu-latest + permissions: + contents: read + id-token: write # This is required for requesting the JWT + environment: e2e-test + env: + GO_VERSION: "1.22" + + steps: + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + ref: ${{ inputs.git_sha }} + + - name: Set e2e Resource and Cluster Name + run: | + rand=$(git rev-parse --short ${{ inputs.git_sha }}) + + if [ "$rand" = "" ]; then + rand=$RANDOM + fi + + echo "VERSION=${rand}" >> $GITHUB_ENV + echo "CLUSTER_NAME=kaito${rand}" >> $GITHUB_ENV + echo 
"RUN_LLAMA_13B=false" >> $GITHUB_ENV + echo "REGISTRY=kaito${rand}.azurecr.io" >> $GITHUB_ENV + + - name: Set Registry + if: ${{ inputs.isRelease }} + run: | + echo "REGISTRY=${{ inputs.registry }}" >> $GITHUB_ENV + echo "VERSION=$(echo ${{ inputs.tag }} | tr -d v)" >> $GITHUB_ENV + + - name: Set up Go ${{ env.GO_VERSION }} + uses: actions/setup-go@v5 + with: + go-version: ${{ env.GO_VERSION }} + + - name: Az login + uses: azure/login@8c334a195cbb38e46038007b304988d888bf676a # v2.0.0 + with: + client-id: ${{ secrets.E2E_CLIENT_ID }} + tenant-id: ${{ secrets.E2E_TENANT_ID }} + subscription-id: ${{ secrets.E2E_SUBSCRIPTION_ID }} + + - uses: azure/setup-helm@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + id: install + + - name: Create Resource Group + shell: bash + run: | + make create-rg + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + + - name: Create ACR + shell: bash + run: | + make create-acr + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }} + + - name: Create Karpenter Azure Identity + uses: azure/CLI@v1.0.9 + with: + inlineScript: | + az identity create --name karpentermsi --resource-group ${{ env.CLUSTER_NAME }} + + - name: build KAITO image + if: ${{ !inputs.isRelease }} + shell: bash + run: | + make docker-build-kaito + env: + REGISTRY: ${{ env.REGISTRY }} + VERSION: ${{ env.VERSION }} + + - name: create cluster + shell: bash + run: | + make create-aks-cluster-for-karpenter + env: + AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }} + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + AZURE_LOCATION: ${{ inputs.region }} + AKS_K8S_VERSION: ${{ inputs.k8s_version }} + + - name: Install karpenter Azure provider helm chart + shell: bash + run: | + make azure-karpenter-helm + kubectl wait --for=condition=available deploy "karpenter" -n karpenter --timeout=300s + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + 
AZURE_TENANT_ID: ${{ secrets.E2E_TENANT_ID }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.E2E_SUBSCRIPTION_ID }} + KARPENTER_VERSION: ${{ vars.KARPENTER_VERSION }} + + - uses: azure/login@8c334a195cbb38e46038007b304988d888bf676a # v2.0.0 + with: + client-id: ${{ secrets.E2E_CLIENT_ID }} + tenant-id: ${{ secrets.E2E_TENANT_ID }} + subscription-id: ${{ secrets.E2E_SUBSCRIPTION_ID }} + + - name: Create Role Assignment + uses: azure/CLI@v1.0.9 + with: + inlineScript: | + KARPENTER_USER_ASSIGNED_PRINCIPAL_ID="$(az identity show --name karpentermsi --resource-group ${{ env.CLUSTER_NAME }} --query 'principalId' -otsv)" + for role in "Virtual Machine Contributor" "Network Contributor" "Managed Identity Operator"; do \ + az role assignment create --assignee "${KARPENTER_USER_ASSIGNED_PRINCIPAL_ID}" \ + --scope "/subscriptions/${{ secrets.E2E_SUBSCRIPTION_ID }}/resourceGroups/${{ env.CLUSTER_NAME }}" --role "$role" + done + + - name: Create Azure Federated Identity + uses: azure/CLI@v1.0.9 + with: + inlineScript: | + AKS_OIDC_ISSUER="$(az aks show -n "${{ env.CLUSTER_NAME }}" -g "${{ env.CLUSTER_NAME }}" --query 'oidcIssuerProfile.issuerUrl' -otsv)" + az identity federated-credential create --name karpenter-fed --identity-name karpentermsi --resource-group "${{ env.CLUSTER_NAME }}" \ + --issuer "${AKS_OIDC_ISSUER}" --subject system:serviceaccount:"karpenter:karpenter-sa" --audience api://AzureADTokenExchange + + - name: Install KAITO Workspace helm chart + shell: bash + run: | + make az-patch-install-helm + kubectl wait --for=condition=available deploy "kaito-workspace" -n kaito-workspace --timeout=300s + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + REGISTRY: ${{ env.REGISTRY }} + VERSION: ${{ env.VERSION }} + + - name: Add Secret Credentials + run: | + kubectl create secret docker-registry ${{ secrets.E2E_AMRT_SECRET_NAME }} \ + --docker-server=${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io \ + --docker-username=${{ 
secrets.E2E_ACR_AMRT_USERNAME }} \ + --docker-password=${{ secrets.E2E_ACR_AMRT_PASSWORD }} + + - name: Log kaito-workspace + run: | + kubectl get pods -n kaito-workspace -o name | grep "^pod/kaito-workspace" | sed 's/^pod\///' | xargs -I {} kubectl logs -n kaito-workspace {} + + - name: Run e2e test + run: | + make kaito-karpenter-e2e-test + env: + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + RUN_LLAMA_13B: ${{ env.RUN_LLAMA_13B }} + AI_MODELS_REGISTRY: ${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io + AI_MODELS_REGISTRY_SECRET: ${{ secrets.E2E_AMRT_SECRET_NAME }} + + - name: Cleanup e2e resources + if: ${{ always() }} + uses: azure/CLI@v1.0.9 + with: + inlineScript: | + set +e + az group delete --name "${{ env.CLUSTER_NAME }}" --yes --no-wait || true diff --git a/Makefile b/Makefile index 650ddc285..5321ef913 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,9 @@ GOLANGCI_LINT_BIN := golangci-lint GOLANGCI_LINT := $(abspath $(TOOLS_BIN_DIR)/$(GOLANGCI_LINT_BIN)-$(GOLANGCI_LINT_VER)) E2E_TEST_BIN := e2e.test +KARPENTER_E2E_TEST_BIN := karpenter-e2e.test E2E_TEST := $(BIN_DIR)/$(E2E_TEST_BIN) +KARPENTER_E2E_TEST := $(BIN_DIR)/$(KARPENTER_E2E_TEST_BIN) GINKGO_VER := v2.17.1 GINKGO_BIN := ginkgo @@ -29,10 +31,17 @@ AKS_K8S_VERSION ?= 1.29.2 AZURE_RESOURCE_GROUP ?= demo AZURE_CLUSTER_NAME ?= kaito-demo AZURE_RESOURCE_GROUP_MC=MC_$(AZURE_RESOURCE_GROUP)_$(AZURE_CLUSTER_NAME)_$(AZURE_LOCATION) -GPU_NAMESPACE ?= gpu-provisioner +GPU_PROVISIONER_NAMESPACE ?= gpu-provisioner KAITO_NAMESPACE ?= kaito-workspace GPU_PROVISIONER_MSI_NAME ?= gpuIdentity +## Karpenter parameters +KARPENTER_NAMESPACE ?= karpenter +KARPENTER_SERVICE_ACCOUNT_NAME ?= karpenter-sa +KARPENTER_VERSION ?= 0.4.0 +AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME ?= karpentermsi +KARPENTER_FEDERATED_IDENTITY_CREDENTIAL_NAME ?= karpenter-fed + RUN_LLAMA_13B ?= false AI_MODELS_REGISTRY ?= modelregistry.azurecr.io AI_MODELS_REGISTRY_SECRET ?= modelregistry @@ -53,37 +62,11 @@ endif $(GOLANGCI_LINT): 
GOBIN=$(TOOLS_BIN_DIR) $(GO_INSTALL) github.com/golangci/golangci-lint/cmd/golangci-lint $(GOLANGCI_LINT_BIN) $(GOLANGCI_LINT_VER) - $(GINKGO): GOBIN=$(TOOLS_BIN_DIR) $(GO_INSTALL) github.com/onsi/ginkgo/v2/ginkgo $(GINKGO_BIN) $(GINKGO_VER) -# CONTAINER_TOOL defines the container tool to be used for building images. -# Be aware that the target commands are only tested with Docker which is -# scaffolded by default. However, you might want to replace it to use other -# tools. (i.e. podman) -CONTAINER_TOOL ?= docker - -# Setting SHELL to bash allows bash commands to be executed by recipes. -# Options are set to exit when a recipe line exits non-zero or a piped command fails. -SHELL = /usr/bin/env bash -o pipefail -.SHELLFLAGS = -ec - -##@ Development - -.PHONY: manifests -manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. - $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases - -.PHONY: generate -generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. - $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." - -.PHONY: fmt -fmt: ## Run go fmt against code. - go fmt ./... - ## -------------------------------------- -## Tests +## Unit Tests ## -------------------------------------- .PHONY: unit-test unit-test: ## Run unit tests. @@ -92,13 +75,14 @@ unit-test: ## Run unit tests. -race -coverprofile=coverage.txt -covermode=atomic go tool cover -func=coverage.txt +## -------------------------------------- +## E2E tests +## -------------------------------------- + inference-api-e2e: pip install -r presets/inference/text-generation/requirements.txt pytest -o log_cli=true -o log_cli_level=INFO . -$(E2E_TEST): - (cd test/e2e && go test -c . 
-o $(E2E_TEST)) - # Ginkgo configurations GINKGO_FOCUS ?= GINKGO_SKIP ?= @@ -107,12 +91,28 @@ GINKGO_NO_COLOR ?= false GINKGO_TIMEOUT ?= 60m GINKGO_ARGS ?= -focus="$(GINKGO_FOCUS)" -skip="$(GINKGO_SKIP)" -nodes=$(GINKGO_NODES) -no-color=$(GINKGO_NO_COLOR) -timeout=$(GINKGO_TIMEOUT) +$(E2E_TEST): + (cd test/e2e && go test -c . -o $(E2E_TEST)) + .PHONY: kaito-workspace-e2e-test kaito-workspace-e2e-test: $(E2E_TEST) $(GINKGO) AI_MODELS_REGISTRY_SECRET=$(AI_MODELS_REGISTRY_SECRET) RUN_LLAMA_13B=$(RUN_LLAMA_13B) \ - AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) GPU_NAMESPACE=$(GPU_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) \ + AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) GPU_NAMESPACE=$(GPU_PROVISIONER_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) \ $(GINKGO) -v -trace $(GINKGO_ARGS) $(E2E_TEST) +$(KARPENTER_E2E_TEST): + (cd test/e2e/karpenter && go test -c . -o $(KARPENTER_E2E_TEST)) + +.PHONY: kaito-karpenter-e2e-test +kaito-karpenter-e2e-test: $(KARPENTER_E2E_TEST) $(GINKGO) + AI_MODELS_REGISTRY_SECRET=$(AI_MODELS_REGISTRY_SECRET) RUN_LLAMA_13B=$(RUN_LLAMA_13B) \ + AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) KARPENTER=$(KARPENTER_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) \ + $(GINKGO) -v -trace $(GINKGO_ARGS) $(KARPENTER_E2E_TEST) + +## -------------------------------------- +## Azure resources +## -------------------------------------- + .PHONY: create-rg create-rg: ## Create resource group az group create --name $(AZURE_RESOURCE_GROUP) --location $(AZURE_LOCATION) -o none @@ -124,47 +124,34 @@ create-acr: ## Create test ACR .PHONY: create-aks-cluster create-aks-cluster: ## Create test AKS cluster (with msi, oidc, and workload identity enabled) - az aks create --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --location $(AZURE_LOCATION) \ - --attach-acr $(AZURE_ACR_NAME) --kubernetes-version $(AKS_K8S_VERSION) --node-count 1 --generate-ssh-keys \ + az aks create --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) \ + --location 
$(AZURE_LOCATION) --attach-acr $(AZURE_ACR_NAME) \ + --kubernetes-version $(AKS_K8S_VERSION) --node-count 1 --generate-ssh-keys \ --enable-managed-identity --enable-workload-identity --enable-oidc-issuer -o none + az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --overwrite-existing .PHONY: create-aks-cluster-with-kaito create-aks-cluster-with-kaito: ## Create test AKS cluster (with msi, oidc and kaito enabled) - az aks create --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --location $(AZURE_LOCATION) --node-count 1 \ - --generate-ssh-keys --enable-managed-identity --enable-oidc-issuer --enable-ai-toolchain-operator -o none - - az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) - -.PHONY: prepare-kaito-addon-identity -prepare-kaito-addon-identity: - IDENTITY_PRINCIPAL_ID=$(shell az identity show --name "ai-toolchain-operator-$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP_MC)" --query 'principalId');\ - az role assignment create --assignee $$IDENTITY_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP_MC)" --role "Contributor" - - AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP_MC)" --query 'oidcIssuerProfile.issuerUrl');\ - az identity federated-credential create --name gpu-federated-cred --identity-name "ai-toolchain-operator-$(AZURE_CLUSTER_NAME)" \ - -g "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \ - --subject system:serviceaccount:"$(KAITO_NAMESPACE):kaito-gpu-provisioner" --audience api://AzureADTokenExchange - -.PHONY: az-patch-install-helm -az-patch-install-helm: ## Update Azure client env vars and settings in helm values.yml - az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) - - yq -i '(.image.repository) = "$(REGISTRY)/workspace"' ./charts/kaito/workspace/values.yaml - yq -i '(.image.tag) = "$(IMG_TAG)"' 
./charts/kaito/workspace/values.yaml - - helm install kaito-workspace ./charts/kaito/workspace --namespace $(KAITO_NAMESPACE) --create-namespace + az aks create --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) \ + --location $(AZURE_LOCATION) --attach-acr $(AZURE_ACR_NAME) \ + --kubernetes-version $(AKS_K8S_VERSION) --node-count 1 --generate-ssh-keys \ + --enable-managed-identity --enable-workload-identity --enable-oidc-issuer -o none + az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --overwrite-existing -##@ Build +.PHONY: create-aks-cluster-for-karpenter +create-aks-cluster-for-karpenter: ## Create test AKS cluster (with msi, cilium, oidc, and workload identity enabled) + az aks create --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) \ + --location $(AZURE_LOCATION) --attach-acr $(AZURE_ACR_NAME) \ + --kubernetes-version $(AKS_K8S_VERSION) --node-count 1 --generate-ssh-keys \ + --network-plugin azure --network-plugin-mode overlay --network-dataplane cilium \ + --enable-managed-identity --enable-oidc-issuer --enable-workload-identity -o none + az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --overwrite-existing -.PHONY: build -build: manifests generate fmt vet ## Build manager binary. - go build -o bin/manager cmd/*.go -.PHONY: run -run: manifests generate fmt vet ## Run a controller from your host. - go run ./cmd/main.go +## -------------------------------------- +## Image Docker Build +## -------------------------------------- -##@ Docker BUILDX_BUILDER_NAME ?= img-builder OUTPUT_TYPE ?= type=registry QEMU_VERSION ?= 5.2.0-2 @@ -187,13 +174,31 @@ docker-build-kaito: docker-buildx --pull \ --tag $(REGISTRY)/$(IMG_NAME):$(IMG_TAG) . 
-##@ Deployment +## -------------------------------------- +## Kaito Installation +## -------------------------------------- +.PHONY: prepare-kaito-addon-identity +prepare-kaito-addon-identity: + IDENTITY_PRINCIPAL_ID=$(shell az identity show --name "ai-toolchain-operator-$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP_MC)" --query 'principalId');\ + az role assignment create --assignee $$IDENTITY_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP_MC)" --role "Contributor" -ifndef ignore-not-found - ignore-not-found = false -endif + AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP_MC)" --query 'oidcIssuerProfile.issuerUrl');\ + az identity federated-credential create --name gpu-federated-cred --identity-name "ai-toolchain-operator-$(AZURE_CLUSTER_NAME)" \ + -g "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \ + --subject system:serviceaccount:"$(KAITO_NAMESPACE):kaito-gpu-provisioner" --audience api://AzureADTokenExchange + +.PHONY: az-patch-install-helm +az-patch-install-helm: ## Update Azure client env vars and settings in helm values.yml + az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) + + yq -i '(.image.repository) = "$(REGISTRY)/workspace"' ./charts/kaito/workspace/values.yaml + yq -i '(.image.tag) = "$(IMG_TAG)"' ./charts/kaito/workspace/values.yaml + + helm install kaito-workspace ./charts/kaito/workspace --namespace $(KAITO_NAMESPACE) --create-namespace -##@ gpu-provider +## -------------------------------------- +## gpu-provider installation +## -------------------------------------- gpu-provisioner-identity-perm: ## Create identity for gpu-provisioner az identity create --name $(GPU_PROVISIONER_MSI_NAME) --resource-group $(AZURE_RESOURCE_GROUP) @@ -202,7 +207,7 @@ gpu-provisioner-identity-perm: ## Create identity for gpu-provisioner AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g 
"$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl");\ az identity federated-credential create --name gpu-federatecredential --identity-name $(GPU_PROVISIONER_MSI_NAME) --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \ - --subject system:serviceaccount:"$(GPU_NAMESPACE):$(GPU_NAMESPACE)" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID) + --subject system:serviceaccount:"$(GPU_PROVISIONER_NAMESPACE):$(GPU_PROVISIONER_NAMESPACE)" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID) .PHONY: gpu-provisioner-helm gpu-provisioner-helm: ## Update Azure client env vars and settings in helm values.yml @@ -211,16 +216,78 @@ gpu-provisioner-helm: ## Update Azure client env vars and settings in helm valu curl -sO https://raw.githubusercontent.com/Azure/gpu-provisioner/main/hack/deploy/configure-helm-values.sh chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) $(GPU_PROVISIONER_MSI_NAME) - helm install $(GPU_NAMESPACE) --values gpu-provisioner-values.yaml --set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) --wait \ + helm install $(GPU_PROVISIONER_NAMESPACE) --values gpu-provisioner-values.yaml --set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) --wait \ https://github.com/Azure/gpu-provisioner/raw/gh-pages/charts/gpu-provisioner-$(GPU_PROVISIONER_VERSION).tgz -##@ Build Dependencies +## -------------------------------------- +## Azure Karpenter Installation +## -------------------------------------- +karpenter-identity-perm: + az identity create --name $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME) --resource-group $(AZURE_RESOURCE_GROUP) + + KARPENTER_USER_ASSIGNED_PRINCIPAL_ID=$(shell az identity show -n "$(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME)" \ + -g "$(AZURE_RESOURCE_GROUP)" --query 'principalId');\ + az role assignment create --assignee 
$$KARPENTER_USER_ASSIGNED_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP)" --role "Virtual Machine Contributor";\ + az role assignment create --assignee $$KARPENTER_USER_ASSIGNED_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP)" --role "Network Contributor";\ + az role assignment create --assignee $$KARPENTER_USER_ASSIGNED_PRINCIPAL_ID --scope "/subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP)" --role "Managed Identity Operator" + + AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl");\ + az identity federated-credential create --name $(KARPENTER_FEDERATED_IDENTITY_CREDENTIAL_NAME) \ + --identity-name $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME) \ + --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \ + --subject system:serviceaccount:"$(KARPENTER_NAMESPACE):$(KARPENTER_SERVICE_ACCOUNT_NAME)" \ + --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID) + +.PHONY: azure-karpenter-helm +azure-karpenter-helm: ## Update Azure client env vars and settings in helm values.yml + az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) + curl -sO https://raw.githubusercontent.com/Azure/karpenter-provider-azure/main/hack/deploy/configure-values.sh + chmod +x ./configure-values.sh && ./configure-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) \ + $(KARPENTER_SERVICE_ACCOUNT_NAME) $(AZURE_KARPENTER_USER_ASSIGNED_IDENTITY_NAME) + + helm upgrade --install karpenter oci://mcr.microsoft.com/aks/karpenter/karpenter \ + --version "$(KARPENTER_VERSION)" \ + --namespace "$(KARPENTER_NAMESPACE)" --create-namespace \ + --values karpenter-values.yaml \ + --set controller.resources.requests.cpu=1 \ + --set controller.resources.requests.memory=1Gi \ + --set 
controller.resources.limits.cpu=1 \ + --set controller.resources.limits.memory=1Gi \ + --wait + + kubectl logs -n "$(KARPENTER_NAMESPACE)" -l app.kubernetes.io/name=karpenter -c controller + +##@ Development +.PHONY: manifests +manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. + $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases + +.PHONY: generate +generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. + $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." + + +##@ Build +.PHONY: build +build: manifests generate fmt vet ## Build manager binary. + go build -o bin/manager cmd/*.go + +.PHONY: run +run: manifests generate fmt vet ## Run a controller from your host. + go run ./cmd/main.go + +##@ Build Dependencies ## Location to install dependencies to LOCALBIN ?= $(shell pwd)/bin $(LOCALBIN): mkdir -p $(LOCALBIN) +##@ Deployment +ifndef ignore-not-found + ignore-not-found = false +endif + ## Tool Binaries KUBECTL ?= kubectl CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen @@ -252,9 +319,13 @@ vet: ## Run go vet against code. lint: $(GOLANGCI_LINT) $(GOLANGCI_LINT) run -v +.PHONY: fmt +fmt: ## Run go fmt against code. + go fmt ./... + ## -------------------------------------- ## Release -## To create a release, run `make release VERSION=x.y.z` +## To create a release, run `make release VERSION=vx.y.z` ## -------------------------------------- .PHONY: release-manifest release-manifest: