From 6f3b30e396a418dd7bf3f5503bca30ed1d162967 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 3 Dec 2024 15:57:24 -0800 Subject: [PATCH 1/2] feat: ragengine workflow --- .github/workflows/e2e-ragengine.yml | 228 ++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 .github/workflows/e2e-ragengine.yml diff --git a/.github/workflows/e2e-ragengine.yml b/.github/workflows/e2e-ragengine.yml new file mode 100644 index 000000000..aaa513a43 --- /dev/null +++ b/.github/workflows/e2e-ragengine.yml @@ -0,0 +1,228 @@ +name: ragengine-e2e-workflow + +on: + workflow_call: + inputs: + git_sha: + type: string + required: true + node_provisioner: + type: string + required: false + default: gpuprovisioner + tag: + type: string + isRelease: + type: boolean + default: false + registry: + type: string + region: + type: string + description: "the azure location to run the e2e test in" + default: "eastus" + k8s_version: + type: string + default: "1.30.0" + +jobs: + e2e-tests: + runs-on: [ "self-hosted", "hostname:kaito-e2e-github-runner" ] + name: e2e-tests-ragengine + permissions: + contents: read + id-token: write # This is required for requesting the JWT + environment: e2e-test + env: + GO_VERSION: "1.22" + KARPENTER_NAMESPACE: "karpenter" + GPU_PROVISIONER_NAMESPACE: "gpu-provisioner" + + steps: + - name: Harden Runner + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: Set e2e Resource and Cluster Name + run: | + rand=$(git rev-parse --short ${{ inputs.git_sha }}) + + if [ "$rand" = "" ]; then + rand=$RANDOM + fi + + echo "VERSION=${rand}" >> $GITHUB_ENV + echo "CLUSTER_NAME=${{ inputs.node_provisioner }}${rand}" >> $GITHUB_ENV + echo "REGISTRY=${{ inputs.node_provisioner }}${rand}.azurecr.io" >> $GITHUB_ENV + echo "RUN_LLAMA_13B=false" >> $GITHUB_ENV + + - name: Remove existing Go modules directory + run: sudo rm -rf ~/go/pkg/mod + + - name: Set up Go ${{ env.GO_VERSION }} + uses: actions/setup-go@v5.1.0 + with: + go-version: ${{ env.GO_VERSION }} + + - name: Install Azure CLI latest + run: | + if ! which az > /dev/null; then + echo "Azure CLI not found. Installing..." + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + else + echo "Azure CLI already installed." + fi + + - name: Azure CLI Login + run: | + az login --identity + + - uses: azure/setup-helm@v4 + id: install + + - name: Generate APIs + run: | + make generate + + - name: build ragengine image + # if: ${{ !inputs.isRelease }} + shell: bash + run: | + make + # env: + # REGISTRY: ${{ env.REGISTRY }} + # VERSION: ${{ env.VERSION }} + + - name: create cluster + shell: bash + run: | + if [ "${{ inputs.node_provisioner }}" == "gpuprovisioner" ]; then + make create-aks-cluster + else + make create-aks-cluster-for-karpenter + fi + env: + AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }} + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + AZURE_LOCATION: ${{ inputs.region }} + AKS_K8S_VERSION: ${{ inputs.k8s_version }} + + - name: Create Identities and Permissions for ${{ inputs.node_provisioner }} + shell: bash + run: | + AZURE_SUBSCRIPTION_ID=$E2E_SUBSCRIPTION_ID \ + make generate-identities + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + TEST_SUITE: ${{ inputs.node_provisioner }} + + - name: Install gpu-provisioner helm chart + if: ${{ inputs.node_provisioner == 'gpuprovisioner' }} + shell: bash + run: | + AZURE_TENANT_ID=$E2E_TENANT_ID \ + AZURE_SUBSCRIPTION_ID=$E2E_SUBSCRIPTION_ID \ + make gpu-provisioner-helm + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + + - name: Install karpenter Azure provider helm chart + if: ${{ inputs.node_provisioner == 'azkarpenter' }} + shell: bash + run: | + AZURE_TENANT_ID=$E2E_TENANT_ID \ + AZURE_SUBSCRIPTION_ID=$E2E_SUBSCRIPTION_ID \ + make azure-karpenter-helm + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + KARPENTER_VERSION: ${{ vars.KARPENTER_VERSION }} + KARPENTER_NAMESPACE: ${{ env.KARPENTER_NAMESPACE }} + + - name: Install ragengine Workspace helm chart + shell: bash + run: | + make + kubectl wait --for=condition=available deploy "ragengine-workspace" -n ragengine-workspace --timeout=300s + env: + AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + REGISTRY: ${{ env.REGISTRY }} + VERSION: ${{ env.VERSION }} + TEST_SUITE: ${{ inputs.node_provisioner }} + + # Retrieve E2E ACR credentials and create Kubernetes secret + - name: Set up E2E ACR Credentials and Secret + shell: bash + run: | + # Retrieve the ACR username and password + ACR_USERNAME=$(az acr credential show --name ${{ env.CLUSTER_NAME }} --resource-group ${{ env.CLUSTER_NAME }} --query "username" -o tsv) + ACR_PASSWORD=$(az acr credential show --name ${{ env.CLUSTER_NAME }} --resource-group ${{ env.CLUSTER_NAME }} --query "passwords[0].value" -o tsv) + + # Ensure credentials were retrieved successfully + if [ -z "$ACR_USERNAME" ] || [ -z "$ACR_PASSWORD" ]; then + echo "Failed to retrieve ACR credentials" + exit 1 + fi + + # Create the Kubernetes secret with the retrieved credentials + kubectl create secret docker-registry ${{ env.CLUSTER_NAME }}-acr-secret \ + --docker-server=${{ env.CLUSTER_NAME }}.azurecr.io \ + --docker-username=${ACR_USERNAME} \ + --docker-password=${ACR_PASSWORD} + + # Add Private-Hosted ACR secret for private models like llama + - name: Add Private-Hosted ACR Secret Credentials + run: | + # Ensure E2E_AMRT_SECRET_NAME is sanitized to remove any accidental quotes + E2E_AMRT_SECRET_NAME=$(echo "$E2E_AMRT_SECRET_NAME" | sed 's/[\"'\'']//g') + + if kubectl get secret "$E2E_AMRT_SECRET_NAME" >/dev/null 2>&1; then + echo "Secret $E2E_AMRT_SECRET_NAME already exists. Skipping creation." + else + kubectl create secret docker-registry "$E2E_AMRT_SECRET_NAME" \ + --docker-server="$E2E_ACR_AMRT_USERNAME.azurecr.io" \ + --docker-username="$E2E_ACR_AMRT_USERNAME" \ + --docker-password="$E2E_ACR_AMRT_PASSWORD" + echo "Secret $E2E_AMRT_SECRET_NAME created successfully." + fi + + - name: Log ${{ inputs.node_provisioner }} + run: | + if [ "${{ inputs.node_provisioner }}" == "gpuprovisioner" ]; then + kubectl logs -n "${{ env.GPU_PROVISIONER_NAMESPACE }}" -l app.kubernetes.io/name=gpu-provisioner -c controller + else + kubectl logs -n "${{ env.KARPENTER_NAMESPACE }}" -l app.kubernetes.io/name=karpenter -c controller + fi + + - name: Log ragengine-workspace + run: | + kubectl get pods -n ragengine-workspace -o name | grep "^pod/ragengine-workspace" | sed 's/^pod\///' | xargs -I {} kubectl logs -n ragengine-workspace {} + + - name: Run e2e test + run: | + make + env: + AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + RUN_LLAMA_13B: ${{ env.RUN_LLAMA_13B }} + REGISTRY: ${{ env.REGISTRY }} + TEST_SUITE: ${{ inputs.node_provisioner }} + E2E_ACR_REGISTRY: ${{ env.CLUSTER_NAME }}.azurecr.io + E2E_ACR_REGISTRY_SECRET: ${{ env.CLUSTER_NAME }}-acr-secret + + - name: Cleanup e2e resources + if: ${{ always() }} + uses: azure/CLI@v2.1.0 + with: + inlineScript: | + set +e + az group delete --name "${{ env.CLUSTER_NAME }}" --yes --no-wait || true From 27b3836886071a1d4fb571cbe6b044afc059540c Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Tue, 3 Dec 2024 16:02:52 -0800 Subject: [PATCH 2/2] feat: add ragengine e2e --- ...gengine.yml => e2e-ragengine-workflow.yml} | 0 .github/workflows/ragengine-e2e.yml | 31 +++++++++++++++++++ 2 files changed, 31 insertions(+) rename .github/workflows/{e2e-ragengine.yml => e2e-ragengine-workflow.yml} (100%) create mode 100644 .github/workflows/ragengine-e2e.yml diff --git a/.github/workflows/e2e-ragengine.yml b/.github/workflows/e2e-ragengine-workflow.yml similarity index 100% rename from .github/workflows/e2e-ragengine.yml rename to .github/workflows/e2e-ragengine-workflow.yml diff --git a/.github/workflows/ragengine-e2e.yml b/.github/workflows/ragengine-e2e.yml new file mode 100644 index 000000000..77d388f29 --- /dev/null +++ b/.github/workflows/ragengine-e2e.yml @@ -0,0 +1,31 @@ +name: pr-e2e-test + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +on: + pull_request: + paths-ignore: ['docs/**', '**.md', '**.mdx', '**.png', '**.jpg'] + +env: + GO_VERSION: "1.22" + +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + +jobs: + run-e2e: + strategy: + fail-fast: false + matrix: + node-provisioner: [gpuprovisioner] # WIP: azkarpenter] + permissions: + contents: read + id-token: write + statuses: write + uses: ./.github/workflows/e2e-ragengine-workflow.yml + with: + git_sha: ${{ github.event.pull_request.head.sha }} + node_provisioner: ${{ matrix.node-provisioner }}