diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 600a98c2a..9092679b7 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -6,3 +6,43 @@ updates: interval: "weekly" commit-message: prefix: "chore" + + - package-ecosystem: docker + directory: /docker/kaito + schedule: + interval: daily + + - package-ecosystem: docker + directory: /docker/presets/inference/llama-2 + schedule: + interval: daily + + - package-ecosystem: docker + directory: /docker/presets/inference/tfs-onnx + schedule: + interval: daily + + - package-ecosystem: docker + directory: /docker/presets/inference/tfs + schedule: + interval: daily + + - package-ecosystem: docker + directory: /docker/presets/tuning + schedule: + interval: daily + + - package-ecosystem: gomod + directory: / + schedule: + interval: daily + + - package-ecosystem: pip + directory: /presets/inference/text-generation + schedule: + interval: daily + + - package-ecosystem: pip + directory: /presets/tuning/tfs + schedule: + interval: daily diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 1651267c0..9343379b8 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -7,6 +7,9 @@ on: schedule: - cron: "0 7 * * 1" # Mondays at 7:00 AM +permissions: + contents: read + jobs: analyze: name: Analyze @@ -17,6 +20,11 @@ jobs: actions: read steps: + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + - name: Checkout repository uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: @@ -33,4 +41,3 @@ jobs: - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@cdcdbb579706841c47f7063dda365e292e5cad7a - diff --git a/.github/workflows/create-release.yml b/.github/workflows/create-release.yml index 94ab2953b..95f6b98b4 100644 --- a/.github/workflows/create-release.yml +++ b/.github/workflows/create-release.yml @@ -21,7 +21,7 @@ jobs: egress-policy: audit - name: Set up Go ${{ env.GO_VERSION }} - uses: actions/setup-go@v5 + uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491 # v5.0.0 with: go-version: ${{ env.GO_VERSION }} @@ -33,7 +33,7 @@ jobs: ref: ${{ github.event.client_payload.tag }} - name: Goreleaser - uses: goreleaser/goreleaser-action@v5 + uses: goreleaser/goreleaser-action@7ec5c2b0c6cdda6e8bbb49444bc797dd33d74dd8 # v5.0.0 with: version: latest args: release --rm-dist --timeout 60m --debug diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml new file mode 100644 index 000000000..3f3456223 --- /dev/null +++ b/.github/workflows/dependency-review.yml @@ -0,0 +1,27 @@ +# Dependency Review Action +# +# This Action will scan dependency manifest files that change as part of a Pull Request, +# surfacing known-vulnerable versions of the packages declared or updated in the PR. +# Once installed, if the workflow run is marked as required, +# PRs introducing known-vulnerable packages will be blocked from merging. +# +# Source repository: https://github.com/actions/dependency-review-action +name: 'Dependency Review' +on: [pull_request] + +permissions: + contents: read + +jobs: + dependency-review: + runs-on: ubuntu-latest + steps: + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + + - name: 'Checkout Repository' + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + - name: 'Dependency Review' + uses: actions/dependency-review-action@0efb1d1d84fc9633afcdaad14c485cbbc90ef46c # v2.5.1 diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index cdd7e5746..6c3a2f112 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -1,30 +1,30 @@ name: E2E Preset Test on: - workflow_run: - workflows: ["Build and Push Preset Models"] - types: - - completed - workflow_dispatch: - inputs: - force-run-all: - type: boolean - default: false - description: "Test all models for E2E" - force-update-all: - type: boolean - default: false - description: "Force update existing images in Prod ACR" + workflow_run: + workflows: ["Build and Push Preset Models"] + types: + - completed + workflow_dispatch: + inputs: + force-run-all: + type: boolean + default: false + description: "Test all models for E2E" + force-update-all: + type: boolean + default: false + description: "Force update existing images in Prod ACR" env: - GO_VERSION: "1.22" - BRANCH_NAME: ${{ github.head_ref || github.ref_name}} - FORCE_RUN_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }} - FORCE_UPDATE_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-update-all == 'true' }} + GO_VERSION: "1.22" + BRANCH_NAME: ${{ github.head_ref || github.ref_name}} + FORCE_RUN_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }} + FORCE_UPDATE_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-update-all == 'true' }} permissions: - id-token: write - contents: read + id-token: write + contents: read jobs: determine-models: @@ -36,60 +36,65 @@ jobs: is_matrix_empty: ${{ steps.check_matrix_empty.outputs.is_empty }} full_matrix: ${{ steps.images.outputs.full_matrix }} steps: + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - submodules: true - fetch-depth: 0 + submodules: true + fetch-depth: 0 # This script should output a JSON array of model names - name: Determine Affected Models id: affected_models run: | - PR_BRANCH=${{ env.BRANCH_NAME }} \ - FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \ - python3 .github/workflows/kind-cluster/determine_models.py + PR_BRANCH=${{ env.BRANCH_NAME }} \ + FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \ + python3 .github/workflows/kind-cluster/determine_models.py - name: Print Determined Models run: | - echo "Output from determine_models: ${{ steps.affected_models.outputs.matrix }}" - + echo "Output from determine_models: ${{ steps.affected_models.outputs.matrix }}" + - name: Check if Matrix is Empty id: check_matrix_empty run: | - if [ "${{ steps.affected_models.outputs.matrix }}" == "[]" ] || [ -z "${{ steps.affected_models.outputs.matrix }}" ]; then - echo "is_empty=true" >> $GITHUB_OUTPUT - else - echo "is_empty=false" >> $GITHUB_OUTPUT - fi - + if [ "${{ steps.affected_models.outputs.matrix }}" == "[]" ] || [ -z "${{ steps.affected_models.outputs.matrix }}" ]; then + echo "is_empty=true" >> $GITHUB_OUTPUT + else + echo "is_empty=false" >> $GITHUB_OUTPUT + fi + - name: Add Config info for Testing if: steps.check_matrix_empty.outputs.is_empty == 'false' id: images - run: | - # Read the additional configurations from e2e-preset-configs.json - CONFIGS=$(cat .github/e2e-preset-configs.json | jq -c '.matrix.image') - - # Pseudocode for combining matrices - # COMBINED_MATRIX = [] - # for model in MATRIX: - # for config in CONFIGS: - # if config['name'] == model['name']: - # combined = {**model, **config} - # COMBINED_MATRIX.append(combined) - # break - - COMBINED_MATRIX=$(echo '${{ steps.affected_models.outputs.matrix }}' | jq --argjson configs "$CONFIGS" -c ' - map(. as $model | $configs[] | select(.name == $model.name) | $model + .) - ') - - echo "full_matrix=$COMBINED_MATRIX" >> $GITHUB_OUTPUT - + run: | + # Read the additional configurations from e2e-preset-configs.json + CONFIGS=$(cat .github/e2e-preset-configs.json | jq -c '.matrix.image') + + # Pseudocode for combining matrices + # COMBINED_MATRIX = [] + # for model in MATRIX: + # for config in CONFIGS: + # if config['name'] == model['name']: + # combined = {**model, **config} + # COMBINED_MATRIX.append(combined) + # break + + COMBINED_MATRIX=$(echo '${{ steps.affected_models.outputs.matrix }}' | jq --argjson configs "$CONFIGS" -c ' + map(. as $model | $configs[] | select(.name == $model.name) | $model + .) + ') + + echo "full_matrix=$COMBINED_MATRIX" >> $GITHUB_OUTPUT + - name: Print Combined Matrix if: steps.check_matrix_empty.outputs.is_empty == 'false' run: | - echo "Combined Matrix:" - echo '${{ steps.images.outputs.full_matrix }}' + echo "Combined Matrix:" + echo '${{ steps.images.outputs.full_matrix }}' e2e-preset-tests: needs: determine-models @@ -105,314 +110,318 @@ jobs: # "node-vm-size":"Standard_NC96ads_A100_v4", "node-osdisk-size":400} model: ${{fromJson(needs.determine-models.outputs.full_matrix)}} steps: + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - submodules: true - fetch-depth: 0 + submodules: true + fetch-depth: 0 - name: Set OSS Flag run: echo "MODEL_IS_OSS=${{ matrix.model.OSS }}" >> $GITHUB_ENV - - name: 'Az CLI login' - uses: azure/login@v1.6.1 + - name: "Az CLI login" + uses: azure/login@cb79c773a3cfa27f31f25eb3f677781210c9ce3d # v1.6.1 with: - client-id: ${{ secrets.AZURE_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - allow-no-subscriptions: true - - - name: 'Set Prod Subscription' + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + allow-no-subscriptions: true + + - name: "Set Prod Subscription" run: az account set --subscription ${{secrets.AZURE_SUBSCRIPTION_ID}} - - - name: 'Check if Image exists in Prod ACR' + + - name: "Check if Image exists in Prod ACR" id: check_prod_image run: | - ACR_NAME=${{ secrets.PROD_ACR_USERNAME }} - IMAGE_NAME=unlisted/aks/kaito/kaito-${{ matrix.model.name }} - TAG=${{ matrix.model.tag }} - - # Use '|| true' to prevent script from exiting with an error if the repository is not found - TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv || true) - - if [[ -z "$TAGS" ]]; then - echo "Image $IMAGE_NAME:$TAG or repository not found in $ACR_NAME." - echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT - else - if echo "$TAGS" | grep -q "^$TAG$"; then - echo "IMAGE_EXISTS=true" >> $GITHUB_OUTPUT - else - echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT - echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME." - fi - fi - - - name: 'Set Test Subscription' + ACR_NAME=${{ secrets.PROD_ACR_USERNAME }} + IMAGE_NAME=unlisted/aks/kaito/kaito-${{ matrix.model.name }} + TAG=${{ matrix.model.tag }} + + # Use '|| true' to prevent script from exiting with an error if the repository is not found + TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv || true) + + if [[ -z "$TAGS" ]]; then + echo "Image $IMAGE_NAME:$TAG or repository not found in $ACR_NAME." + echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT + else + if echo "$TAGS" | grep -q "^$TAG$"; then + echo "IMAGE_EXISTS=true" >> $GITHUB_OUTPUT + else + echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT + echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME." + fi + fi + + - name: "Set Test Subscription" run: az account set --subscription ${{secrets.AZURE_SUBSCRIPTION_ID}} - - name: 'Check if Image exists in Test ACR' + - name: "Check if Image exists in Test ACR" id: check_test_image run: | - ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }} - IMAGE_NAME=${{ matrix.model.name }} - TAG=${{ matrix.model.tag }} - - # Use '|| true' to prevent script from exiting with an error if the repository is not found - TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv || true) - - if [[ -z "$TAGS" ]]; then - echo "Image $IMAGE_NAME:$TAG or repository not found in $ACR_NAME." - echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT - else - if echo "$TAGS" | grep -q "^$TAG$"; then - echo "IMAGE_EXISTS=true" >> $GITHUB_OUTPUT - else - echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT - echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME." - fi - fi - + ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }} + IMAGE_NAME=${{ matrix.model.name }} + TAG=${{ matrix.model.tag }} + + # Use '|| true' to prevent script from exiting with an error if the repository is not found + TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv || true) + + if [[ -z "$TAGS" ]]; then + echo "Image $IMAGE_NAME:$TAG or repository not found in $ACR_NAME." + echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT + else + if echo "$TAGS" | grep -q "^$TAG$"; then + echo "IMAGE_EXISTS=true" >> $GITHUB_OUTPUT + else + echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT + echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME." + fi + fi + - name: Check if Image is Test and Prod ACRs if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'true' run: | - echo "Image already exists in both Test and Prod ACRs, remember to bump tag" + echo "Image already exists in both Test and Prod ACRs, remember to bump tag" - name: Set up kubectl context if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | az aks get-credentials --resource-group llm-test --name GitRunner - + - name: Get Nodepool Name if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') id: get_nodepool_name run: | - NAME_SUFFIX=${{ matrix.model.name }} - NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/} # Removing all '-' symbols - - if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then - TRUNCATED_NAME_SUFFIX=${NAME_SUFFIX_WITHOUT_DASHES: -12} - else - TRUNCATED_NAME_SUFFIX=$NAME_SUFFIX_WITHOUT_DASHES - fi - echo "Nodepool Name: $TRUNCATED_NAME_SUFFIX" - echo "NODEPOOL_NAME=$TRUNCATED_NAME_SUFFIX" >> $GITHUB_OUTPUT + NAME_SUFFIX=${{ matrix.model.name }} + NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/} # Removing all '-' symbols + + if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then + TRUNCATED_NAME_SUFFIX=${NAME_SUFFIX_WITHOUT_DASHES: -12} + else + TRUNCATED_NAME_SUFFIX=$NAME_SUFFIX_WITHOUT_DASHES + fi + echo "Nodepool Name: $TRUNCATED_NAME_SUFFIX" + echo "NODEPOOL_NAME=$TRUNCATED_NAME_SUFFIX" >> $GITHUB_OUTPUT - name: Create Nodepool if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | - NODEPOOL_EXIST=$(az aks nodepool show \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ - --cluster-name GitRunner \ - --resource-group llm-test \ - --query 'name' -o tsv || echo "") - echo "NODEPOOL_EXIST: $NODEPOOL_EXIST" - if [ -z "$NODEPOOL_EXIST" ]; then - az aks nodepool add \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ - --cluster-name GitRunner \ - --resource-group llm-test \ - --node-count ${{ matrix.model.node-count }} \ - --node-vm-size ${{ matrix.model.node-vm-size }} \ - --node-osdisk-size ${{ matrix.model.node-osdisk-size }} \ - --labels pool=${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ - --node-taints sku=gpu:NoSchedule \ - --aks-custom-headers UseGPUDedicatedVHD=true - else - NODEPOOL_STATE=$(az aks nodepool show \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ - --cluster-name GitRunner \ - --resource-group llm-test \ - --query 'provisioningState' -o tsv) - echo "NODEPOOL_STATE: $NODEPOOL_STATE" - if [ "$NODEPOOL_STATE" != "Succeeded" ]; then - echo "Nodepool exists but is not in a Succeeded state. Please check manually." - exit 1 - else - echo "Nodepool already exists and is in a running state." - fi - fi + NODEPOOL_EXIST=$(az aks nodepool show \ + --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --cluster-name GitRunner \ + --resource-group llm-test \ + --query 'name' -o tsv || echo "") + echo "NODEPOOL_EXIST: $NODEPOOL_EXIST" + if [ -z "$NODEPOOL_EXIST" ]; then + az aks nodepool add \ + --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --cluster-name GitRunner \ + --resource-group llm-test \ + --node-count ${{ matrix.model.node-count }} \ + --node-vm-size ${{ matrix.model.node-vm-size }} \ + --node-osdisk-size ${{ matrix.model.node-osdisk-size }} \ + --labels pool=${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --node-taints sku=gpu:NoSchedule \ + --aks-custom-headers UseGPUDedicatedVHD=true + else + NODEPOOL_STATE=$(az aks nodepool show \ + --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --cluster-name GitRunner \ + --resource-group llm-test \ + --query 'provisioningState' -o tsv) + echo "NODEPOOL_STATE: $NODEPOOL_STATE" + if [ "$NODEPOOL_STATE" != "Succeeded" ]; then + echo "Nodepool exists but is not in a Succeeded state. Please check manually." + exit 1 + else + echo "Nodepool already exists and is in a running state." + fi + fi - name: Create Service if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-service.yaml - + - name: Retrieve External Service IP if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') id: get_ip run: | - while [[ -z $SERVICE_IP ]]; do - SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}') - sleep 5 - done - echo "Service IP is $SERVICE_IP" - echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT + while [[ -z $SERVICE_IP ]]; do + SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}') + sleep 5 + done + echo "Service IP is $SERVICE_IP" + echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT - name: Get Resource Type id: resource run: | - RESOURCE_TYPE=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "statefulset" || echo "deployment") - echo "RESOURCE_TYPE=$RESOURCE_TYPE" >> $GITHUB_OUTPUT - + RESOURCE_TYPE=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "statefulset" || echo "deployment") + echo "RESOURCE_TYPE=$RESOURCE_TYPE" >> $GITHUB_OUTPUT + - name: Replace IP and Deploy Resource to K8s if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | - sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml - sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml - sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml - kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml + sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml + sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml + sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml + kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml - name: Wait for Resource to be ready if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | - kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} --timeout=1800s - + kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} --timeout=1800s + - name: Test home endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | - curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/ + curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/ - name: Test healthz endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | - curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz - + curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz + - name: Test inference endpoint if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true') run: | - if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then - echo "Testing inference for ${{ matrix.model.name }}" - curl -X POST \ - -H "Content-Type: application/json" \ - -d '{ - "input_data": { - "input_string": [ - [ - { - "role": "system", - "content": "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe." - }, - { - "role": "user", - "content": "Write a brief birthday message to John" - } - ] - ] - } - }' \ - http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat - elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then - echo "Testing inference for ${{ matrix.model.name }}" - curl -X POST \ - -H "Content-Type: application/json" \ - -d '{ - "prompts": [ - "I believe the meaning of life is", - "Simply put, the theory of relativity states that ", - "A brief message congratulating the team on the launch: Hi everyone, I just ", - "Translate English to French: sea otter => loutre de mer, peppermint => menthe poivrée, plush girafe => girafe peluche, cheese =>" - ], - "parameters": { - "max_gen_len": 128 - } - }' \ - http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate - else - echo "Testing inference for ${{ matrix.model.name }}" - curl -X POST \ - -H "accept: application/json" \ - -H "Content-Type: application/json" \ - -d '{ - "prompt":"Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:", - "return_full_text": false, - "clean_up_tokenization_spaces": false, - "prefix": null, - "handle_long_generation": null, - "generate_kwargs": { - "max_length":200, - "min_length":0, - "do_sample":true, - "early_stopping":false, - "num_beams":1, - "num_beam_groups":1, - "diversity_penalty":0.0, - "temperature":1.0, - "top_k":10, - "top_p":1, - "typical_p":1, - "repetition_penalty":1, - "length_penalty":1, - "no_repeat_ngram_size":0, - "encoder_no_repeat_ngram_size":0, - "bad_words_ids":null, - "num_return_sequences":1, - "output_scores":false, - "return_dict_in_generate":false, - "forced_bos_token_id":null, - "forced_eos_token_id":null, - "remove_invalid_values":null - } - }' \ - http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat - fi - + if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then + echo "Testing inference for ${{ matrix.model.name }}" + curl -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "input_data": { + "input_string": [ + [ + { + "role": "system", + "content": "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe." + }, + { + "role": "user", + "content": "Write a brief birthday message to John" + } + ] + ] + } + }' \ + http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat + elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then + echo "Testing inference for ${{ matrix.model.name }}" + curl -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "prompts": [ + "I believe the meaning of life is", + "Simply put, the theory of relativity states that ", + "A brief message congratulating the team on the launch: Hi everyone, I just ", + "Translate English to French: sea otter => loutre de mer, peppermint => menthe poivrée, plush girafe => girafe peluche, cheese =>" + ], + "parameters": { + "max_gen_len": 128 + } + }' \ + http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate + else + echo "Testing inference for ${{ matrix.model.name }}" + curl -X POST \ + -H "accept: application/json" \ + -H "Content-Type: application/json" \ + -d '{ + "prompt":"Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:", + "return_full_text": false, + "clean_up_tokenization_spaces": false, + "prefix": null, + "handle_long_generation": null, + "generate_kwargs": { + "max_length":200, + "min_length":0, + "do_sample":true, + "early_stopping":false, + "num_beams":1, + "num_beam_groups":1, + "diversity_penalty":0.0, + "temperature":1.0, + "top_k":10, + "top_p":1, + "typical_p":1, + "repetition_penalty":1, + "length_penalty":1, + "no_repeat_ngram_size":0, + "encoder_no_repeat_ngram_size":0, + "bad_words_ids":null, + "num_return_sequences":1, + "output_scores":false, + "return_dict_in_generate":false, + "forced_bos_token_id":null, + "forced_eos_token_id":null, + "remove_invalid_values":null + } + }' \ + http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat + fi + - name: Move from Test to Prod ACR if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_UPDATE_ALL== 'true') && github.event_name == 'workflow_dispatch' && env.MODEL_IS_OSS == 'true' run: | - # This should only run if: - # 1. All prior steps have succeeed (Given) - # 2. Image exists in test ACR repo but not Prod - # 3. Workflow was triggered manually (workflow_dispatch) - # 4. Image is OSS (MIT/Apache2.0) - - az account set --subscription ${{secrets.PROD_ACR_SUB_ID}} - - TEST_ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }} - PROD_ACR_NAME=${{ secrets.PROD_ACR_USERNAME }} - IMAGE_NAME=${{ matrix.model.name }} - TAG=${{ matrix.model.tag }} - - # Formulate the source image reference - SOURCE_IMAGE=$IMAGE_NAME:$TAG - DEST_IMAGE=unlisted/aks/kaito/kaito-$IMAGE_NAME:$TAG - - # Import the image from Test ACR to Prod ACR - az acr import \ - --name $PROD_ACR_NAME \ - --source $SOURCE_IMAGE \ - --image $DEST_IMAGE \ - --registry /subscriptions/${{secrets.AZURE_SUBSCRIPTION_ID}}/resourceGroups/${{secrets.TEST_ACR_RG}}/providers/Microsoft.ContainerRegistry/registries/$TEST_ACR_NAME - + # This should only run if: + # 1. All prior steps have succeeed (Given) + # 2. Image exists in test ACR repo but not Prod + # 3. Workflow was triggered manually (workflow_dispatch) + # 4. Image is OSS (MIT/Apache2.0) + + az account set --subscription ${{secrets.PROD_ACR_SUB_ID}} + + TEST_ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }} + PROD_ACR_NAME=${{ secrets.PROD_ACR_USERNAME }} + IMAGE_NAME=${{ matrix.model.name }} + TAG=${{ matrix.model.tag }} + + # Formulate the source image reference + SOURCE_IMAGE=$IMAGE_NAME:$TAG + DEST_IMAGE=unlisted/aks/kaito/kaito-$IMAGE_NAME:$TAG + + # Import the image from Test ACR to Prod ACR + az acr import \ + --name $PROD_ACR_NAME \ + --source $SOURCE_IMAGE \ + --image $DEST_IMAGE \ + --registry /subscriptions/${{secrets.AZURE_SUBSCRIPTION_ID}}/resourceGroups/${{secrets.TEST_ACR_RG}}/providers/Microsoft.ContainerRegistry/registries/$TEST_ACR_NAME + - name: Cleanup if: always() run: | - # Only proceed if RESOURCE_TYPE is set (else resource wasn't created) - if [ -n "${{ steps.resource.outputs.RESOURCE_TYPE }}" ]; then - # Use RESOURCE_TYPE from the previous step - RESOURCE_TYPE=${{ steps.resource.outputs.RESOURCE_TYPE }} - - # Check and Delete K8s Resource (Deployment or StatefulSet) - if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then - kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }} - fi - fi - - # Check and Delete K8s Service if it exists - if kubectl get svc ${{ matrix.model.name }} > /dev/null 2>&1; then - kubectl delete svc ${{ matrix.model.name }} - fi - - # Check and Delete AKS Nodepool if it exists - if [ -n "${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }}" ]; then - NODEPOOL_EXIST=$(az aks nodepool show \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ - --cluster-name GitRunner \ - --resource-group llm-test \ - --query 'name' -o tsv || echo "") - - if [ -n "$NODEPOOL_EXIST" ]; then - az aks nodepool delete \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ - --cluster-name GitRunner \ - --resource-group llm-test - fi - fi - + # Only proceed if RESOURCE_TYPE is set (else resource wasn't created) + if [ -n "${{ steps.resource.outputs.RESOURCE_TYPE }}" ]; then + # Use RESOURCE_TYPE from the previous step + RESOURCE_TYPE=${{ steps.resource.outputs.RESOURCE_TYPE }} + + # Check and Delete K8s Resource (Deployment or StatefulSet) + if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then + kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }} + fi + fi + + # Check and Delete K8s Service if it exists + if kubectl get svc ${{ matrix.model.name }} > /dev/null 2>&1; then + kubectl delete svc ${{ matrix.model.name }} + fi + + # Check and Delete AKS Nodepool if it exists + if [ -n "${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }}" ]; then + NODEPOOL_EXIST=$(az aks nodepool show \ + --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --cluster-name GitRunner \ + --resource-group llm-test \ + --query 'name' -o tsv || echo "") + + if [ -n "$NODEPOOL_EXIST" ]; then + az aks nodepool delete \ + --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --cluster-name GitRunner \ + --resource-group llm-test + fi + fi diff --git a/.github/workflows/e2e-workflow.yml b/.github/workflows/e2e-workflow.yml index a271fa0c5..8e6702ddf 100644 --- a/.github/workflows/e2e-workflow.yml +++ b/.github/workflows/e2e-workflow.yml @@ -1,66 +1,43 @@ -name: kaito-e2e-workflow +name: e2e-test + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true on: - workflow_call: - inputs: - git_sha: - type: string - required: true - tag: - type: string - isRelease: - type: boolean - default: false - registry: - type: string - region: - type: string - description: "the azure location to run the e2e test in" - default: "eastus" - k8s_version: - type: string - default: "1.27" - secrets: - E2E_CLIENT_ID: - required: true - E2E_TENANT_ID: - required: true - E2E_SUBSCRIPTION_ID: - required: true - E2E_AMRT_SECRET_NAME: - required: true - E2E_ACR_AMRT_USERNAME: - required: true - E2E_ACR_AMRT_PASSWORD: - required: true + push: + branches: [main] + paths-ignore: ["docs/**", "**.md", "**.mdx", "**.png", "**.jpg"] + pull_request: + branches: [main] + paths-ignore: ["docs/**", "**.md", "**.mdx", "**.png", "**.jpg"] + repository_dispatch: + types: [release-tag] + branches: [release-**] + +env: + GO_VERSION: "1.20" permissions: + id-token: write # This is required for requesting the JWT contents: read # This is required for actions/checkout jobs: e2e-tests: runs-on: ubuntu-latest - permissions: - contents: read - id-token: write # This is required for requesting the JWT environment: e2e-test env: GO_VERSION: "1.22" steps: - - name: Harden Runner - uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 - with: - egress-policy: audit - - - name: Checkout - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - with: - ref: ${{ inputs.git_sha }} + - name: Shorten SHA + if: ${{ !github.event.client_payload.isRelease }} + id: vars + run: echo "pr_sha_short=$(git rev-parse --short ${{ github.event.pull_request.head.sha }})" >> $GITHUB_OUTPUT - name: Set e2e Resource and Cluster Name run: | - rand=$(git rev-parse --short ${{ inputs.git_sha }}) + rand=${{ steps.vars.outputs.pr_sha_short }} if [ "$rand" = "" ]; then rand=$RANDOM @@ -72,22 +49,36 @@ jobs: echo "REGISTRY=kaito${rand}.azurecr.io" >> $GITHUB_ENV - name: Set Registry - if: ${{ inputs.isRelease }} + if: ${{ github.event.client_payload.isRelease }} run: | - echo "REGISTRY=${{ inputs.registry }}" >> $GITHUB_ENV - echo "VERSION=$(echo ${{ inputs.tag }} | tr -d v)" >> $GITHUB_ENV + echo "REGISTRY=${{ github.event.client_payload.registry }}" >> $GITHUB_ENV + echo "VERSION=$(echo ${{ github.event.client_payload.tag }} | tr -d v)" >> $GITHUB_ENV - name: Set up Go ${{ env.GO_VERSION }} uses: actions/setup-go@v5 with: go-version: ${{ env.GO_VERSION }} - - name: Az login - uses: azure/login@8c334a195cbb38e46038007b304988d888bf676a # v2.0.0 + - name: Checkout + if: ${{ !github.event.client_payload.isRelease }} + uses: actions/checkout@v4 with: - client-id: ${{ secrets.E2E_CLIENT_ID }} - tenant-id: ${{ secrets.E2E_TENANT_ID }} - subscription-id: ${{ secrets.E2E_SUBSCRIPTION_ID }} + submodules: true + fetch-depth: 0 + + - name: Checkout + uses: actions/checkout@v4 + if: ${{ github.event.client_payload.isRelease }} + with: + fetch-depth: 0 + submodules: true + ref: ${{ env.REPO_TAG }} + + - uses: azure/login@v1.6.1 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - uses: azure/setup-helm@v4 with: @@ -116,7 +107,7 @@ jobs: az identity create --name gpuIdentity --resource-group ${{ env.CLUSTER_NAME }} - name: build KAITO image - if: ${{ !inputs.isRelease }} + if: ${{ !github.event.client_payload.isRelease }} shell: bash run: | make docker-build-kaito @@ -132,8 +123,6 @@ jobs: AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }} AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} - AZURE_LOCATION: ${{ inputs.region }} - AKS_K8S_VERSION: ${{ inputs.k8s_version }} - name: Install gpu-provisioner helm chart shell: bash @@ -144,18 +133,18 @@ jobs: AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} - - uses: azure/login@8c334a195cbb38e46038007b304988d888bf676a # v2.0.0 + - uses: azure/login@v1.6.1 with: - client-id: ${{ secrets.E2E_CLIENT_ID }} - tenant-id: ${{ secrets.E2E_TENANT_ID }} - subscription-id: ${{ secrets.E2E_SUBSCRIPTION_ID }} + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - name: Create Role Assignment uses: azure/CLI@v1.0.9 with: inlineScript: | IDENTITY_PRINCIPAL_ID="$(az identity show --name gpuIdentity --resource-group ${{ env.CLUSTER_NAME }} --query 'principalId' -otsv)" - az role assignment create --assignee ${IDENTITY_PRINCIPAL_ID} --scope "/subscriptions/${{ secrets.E2E_SUBSCRIPTION_ID }}/resourceGroups/${{ env.CLUSTER_NAME }}" --role "Contributor" + az role assignment create --assignee ${IDENTITY_PRINCIPAL_ID} --scope "/subscriptions/${{ secrets.AZURE_SUBSCRIPTION_ID }}/resourceGroups/${{ env.CLUSTER_NAME }}" --role "Contributor" - name: Create Azure Federated Identity uses: azure/CLI@v1.0.9 @@ -175,14 +164,14 @@ jobs: AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} REGISTRY: ${{ env.REGISTRY }} VERSION: ${{ env.VERSION }} - + - name: Add Secret Credentials run: | - kubectl create secret docker-registry ${{ secrets.E2E_AMRT_SECRET_NAME }} \ - --docker-server=${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io \ - --docker-username=${{ secrets.E2E_ACR_AMRT_USERNAME }} \ - --docker-password=${{ secrets.E2E_ACR_AMRT_PASSWORD }} - + kubectl create secret docker-registry ${{secrets.AMRT_SECRET_NAME}} \ + --docker-server=${{secrets.ACR_AMRT_USERNAME}}.azurecr.io \ + --docker-username=${{secrets.ACR_AMRT_USERNAME}} \ + --docker-password=${{secrets.ACR_AMRT_PASSWORD}} + - name: Log kaito-workspace run: | kubectl get pods -n kaito-workspace -o name | grep "^pod/kaito-workspace" | sed 's/^pod\///' | xargs -I {} kubectl logs -n kaito-workspace {} @@ -193,8 +182,8 @@ jobs: env: AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} RUN_LLAMA_13B: ${{ env.RUN_LLAMA_13B }} - AI_MODELS_REGISTRY: ${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io - AI_MODELS_REGISTRY_SECRET: ${{ secrets.E2E_AMRT_SECRET_NAME }} + AI_MODELS_REGISTRY: ${{secrets.ACR_AMRT_USERNAME}}.azurecr.io + AI_MODELS_REGISTRY_SECRET: ${{secrets.AMRT_SECRET_NAME}} - name: Cleanup e2e resources if: ${{ always() }} diff --git a/.github/workflows/helm-chart.yml b/.github/workflows/helm-chart.yml index 2df542691..05e06123e 100644 --- a/.github/workflows/helm-chart.yml +++ b/.github/workflows/helm-chart.yml @@ -16,6 +16,11 @@ jobs: publish-helm: runs-on: ubuntu-latest steps: + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + - name: Checkout uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: @@ -24,7 +29,7 @@ jobs: ref: ${{ github.event.client_payload.tag }} - name: Publish Workspace Helm chart - uses: stefanprodan/helm-gh-pages@v1.7.0 + uses: stefanprodan/helm-gh-pages@0ad2bb377311d61ac04ad9eb6f252fb68e207260 # v1.7.0 with: token: ${{ secrets.GITHUB_TOKEN }} charts_dir: charts/kaito diff --git a/.github/workflows/lint-go.yml b/.github/workflows/lint-go.yml index 5c523f766..3170149c7 100644 --- a/.github/workflows/lint-go.yml +++ b/.github/workflows/lint-go.yml @@ -15,6 +15,9 @@ on: env: GO_VERSION: '1.22' +permissions: + contents: read + jobs: build: runs-on: ubuntu-latest @@ -32,7 +35,7 @@ jobs: fetch-depth: 0 - name: Set up Go ${{ env.GO_VERSION }} - uses: actions/setup-go@v5 + uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491 # v5.0.0 with: go-version: ${{ env.GO_VERSION }} diff --git a/.github/workflows/markdown-link-check.yml b/.github/workflows/markdown-link-check.yml index 00e6c44f7..2fb144a3e 100644 --- a/.github/workflows/markdown-link-check.yml +++ b/.github/workflows/markdown-link-check.yml @@ -3,18 +3,26 @@ name: markdown link on: pull_request: paths: - - '**.md' - - 'docs/**' + - "**.md" + - "docs/**" + +permissions: + contents: read jobs: markdown-link-check: runs-on: ubuntu-latest steps: + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - uses: gaurav-nelson/github-action-markdown-link-check@v1 + - uses: gaurav-nelson/github-action-markdown-link-check@5c5dfc0ac2e225883c0e5f03a85311ec2830d368 # v1 with: # this will only show errors in the output - use-quiet-mode: 'yes' + use-quiet-mode: "yes" # this will show detailed HTTP status for checked links - use-verbose-mode: 'yes' - config-file: '.github/markdown.links.config.json' + use-verbose-mode: "yes" + config-file: ".github/markdown.links.config.json" diff --git a/.github/workflows/pr-title-lint.yml b/.github/workflows/pr-title-lint.yml index 160227fe1..397663dac 100644 --- a/.github/workflows/pr-title-lint.yml +++ b/.github/workflows/pr-title-lint.yml @@ -12,7 +12,12 @@ jobs: check: runs-on: ubuntu-latest steps: - - uses: thehanimo/pr-title-checker@v1.4.2 + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + + - uses: thehanimo/pr-title-checker@1d8cd483a2b73118406a187f54dca8a9415f1375 # v1.4.2 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} pass_on_octokit_error: true diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml index e0fb52696..aa19b3ec4 100644 --- a/.github/workflows/preset-image-build.yml +++ b/.github/workflows/preset-image-build.yml @@ -9,14 +9,14 @@ on: branches: - main paths: - - 'presets/inference/**' - - 'presets/models/supported_models.yaml' + - "presets/inference/**" + - "presets/models/supported_models.yaml" push: branches: - main paths: - - 'presets/inference/**' - - 'presets/models/supported_models.yaml' + - "presets/inference/**" + - "presets/models/supported_models.yaml" workflow_dispatch: inputs: force-run-all: @@ -25,30 +25,35 @@ on: description: "Run all models for build" env: - GO_VERSION: "1.22" - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + GO_VERSION: "1.22" + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} permissions: id-token: write contents: read jobs: - determine-models: + determine-models: runs-on: ubuntu-latest environment: preset-env outputs: matrix: ${{ steps.affected_models.outputs.matrix }} is_matrix_empty: ${{ steps.check_matrix_empty.outputs.is_empty }} - steps: + steps: + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: submodules: true fetch-depth: 0 - + - name: Set FORCE_RUN_ALL Flag run: echo "FORCE_RUN_ALL=${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }}" >> $GITHUB_ENV - + # This script should output a JSON array of model names - name: Determine Affected Models id: affected_models @@ -56,11 +61,11 @@ jobs: PR_BRANCH=${{ env.BRANCH_NAME }} \ FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \ python3 .github/workflows/kind-cluster/determine_models.py - + - name: Print Determined Models run: | echo "Output from affected_models: ${{ steps.affected_models.outputs.matrix }}" - + - name: Check if Matrix is Empty id: check_matrix_empty run: | @@ -69,11 +74,11 @@ jobs: else echo "is_empty=false" >> $GITHUB_OUTPUT fi - + build-models: needs: determine-models if: needs.determine-models.outputs.is_matrix_empty == 'false' - runs-on: [self-hosted, 'hostname:model-server'] + runs-on: [self-hosted, "hostname:model-server"] environment: preset-env strategy: fail-fast: false @@ -81,35 +86,40 @@ jobs: model: ${{fromJson(needs.determine-models.outputs.matrix)}} max-parallel: 3 steps: + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: submodules: true fetch-depth: 0 - + - name: Check Available Disk Space run: df -h - name: Install Azure CLI latest run: | - if ! which az > /dev/null; then - echo "Azure CLI not found. Installing..." - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - else - echo "Azure CLI already installed." - fi - - - name: 'Az CLI login' - uses: azure/login@v1.6.1 + if ! which az > /dev/null; then + echo "Azure CLI not found. Installing..." + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + else + echo "Azure CLI already installed." + fi + + - name: "Az CLI login" + uses: azure/login@cb79c773a3cfa27f31f25eb3f677781210c9ce3d # v1.6.1 with: - client-id: ${{ secrets.AZURE_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - allow-no-subscriptions: true - - - name: 'Set subscription' + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + allow-no-subscriptions: true + + - name: "Set subscription" run: az account set --subscription ${{secrets.AZURE_SUBSCRIPTION_ID}} - - name: 'Get ACR Info' + - name: "Get ACR Info" id: acr_info run: | ACR_NAME="${{ secrets.ACR_AMRT_USERNAME }}" @@ -118,29 +128,29 @@ jobs: echo "ACR_NAME=$ACR_NAME" >> $GITHUB_OUTPUT echo "ACR_USERNAME=$ACR_USERNAME" >> $GITHUB_OUTPUT echo "ACR_PASSWORD=$ACR_PASSWORD" >> $GITHUB_OUTPUT - - - name: 'Check if Image exists in Test ACR' + + - name: "Check if Image exists in Test ACR" id: check_test_image run: | - ACR_NAME=${{ steps.acr_info.outputs.ACR_USERNAME }} - IMAGE_NAME=${{ matrix.model.name }} - TAG=${{ matrix.model.tag }} - - # Use '|| true' to prevent script from exiting with an error if the repository is not found - TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv || true) - - if [[ -z "$TAGS" ]]; then - echo "Image $IMAGE_NAME:$TAG or repository not found in $ACR_NAME." - echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT - else - if echo "$TAGS" | grep -q "^$TAG$"; then - echo "IMAGE_EXISTS=true" >> $GITHUB_OUTPUT - else - echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT - echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME." - fi - fi - + ACR_NAME=${{ steps.acr_info.outputs.ACR_USERNAME }} + IMAGE_NAME=${{ matrix.model.name }} + TAG=${{ matrix.model.tag }} + + # Use '|| true' to prevent script from exiting with an error if the repository is not found + TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv || true) + + if [[ -z "$TAGS" ]]; then + echo "Image $IMAGE_NAME:$TAG or repository not found in $ACR_NAME." + echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT + else + if echo "$TAGS" | grep -q "^$TAG$"; then + echo "IMAGE_EXISTS=true" >> $GITHUB_OUTPUT + else + echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT + echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME." + fi + fi + - name: Launch Python Script to Kickoff Build Jobs if: steps.check_test_image.outputs.IMAGE_EXISTS == 'false' id: launch_script @@ -155,7 +165,7 @@ jobs: MODEL_RUNTIME=${{ matrix.model.runtime }} \ MODEL_TAG=${{ matrix.model.tag }} \ python3 .github/workflows/kind-cluster/main.py - + # Check the exit status of the Python script - name: Check Python Script Status if: ${{ always() }} diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml new file mode 100644 index 000000000..b4c4e07e0 --- /dev/null +++ b/.github/workflows/scorecards.yml @@ -0,0 +1,76 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + - cron: '20 7 * * 2' + push: + branches: ["main"] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + contents: read + actions: read + + steps: + - name: Harden Runner + uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0 + with: + egress-policy: audit + + - name: "Checkout code" + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecards on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard. + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@a82bad71823183e5b120ab52d521460ecb0585fe # v2.24.9 + with: + sarif_file: results.sarif diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3afaa2ff9..46eedd3ec 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,27 +1,27 @@ name: unit-tests concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true on: push: branches: [main] - paths-ignore: ['docs/**', '**.md', '**.mdx', '**.png', '**.jpg'] + paths-ignore: ["docs/**", "**.md", "**.mdx", "**.png", "**.jpg"] pull_request: branches: [main, release-**] - paths-ignore: ['docs/**', '**.md', '**.mdx', '**.png', '**.jpg'] + paths-ignore: ["docs/**", "**.md", "**.mdx", "**.png", "**.jpg"] permissions: contents: read packages: write - + env: - GO_VERSION: '1.22' + GO_VERSION: "1.22" jobs: unit-tests: - runs-on: ubuntu-latest + runs-on: ubuntu-latest environment: unit-tests steps: - name: Harden Runner @@ -35,8 +35,8 @@ jobs: submodules: true fetch-depth: 0 - - name: Set up Go ${{ env.GO_VERSION }} - uses: actions/setup-go@v5 + - name: Set up Go ${{ env.GO_VERSION }} + uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491 # v5.0.0 with: go-version: ${{ env.GO_VERSION }} @@ -49,7 +49,7 @@ jobs: make inference-api-e2e - name: Upload Codecov report - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@54bcd8715eee62d40e33596ef5e8f0f48dbbccab # v4.1.0 with: ## Comma-separated list of files to upload files: ./coverage.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..467d0e0e5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: + - repo: https://github.com/gitleaks/gitleaks + rev: v8.16.3 + hooks: + - id: gitleaks + - repo: https://github.com/golangci/golangci-lint + rev: v1.52.2 + hooks: + - id: golangci-lint + - repo: https://github.com/jumanjihouse/pre-commit-hooks + rev: 3.0.0 + hooks: + - id: shellcheck + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/pylint-dev/pylint + rev: v2.17.2 + hooks: + - id: pylint diff --git a/docker/kaito/Dockerfile b/docker/kaito/Dockerfile index 19a0fd30c..5970adc43 100644 --- a/docker/kaito/Dockerfile +++ b/docker/kaito/Dockerfile @@ -32,7 +32,7 @@ RUN --mount=type=cache,target=${GOCACHE} \ # Use distroless as minimal base image to package the manager binary # Refer to https://github.com/GoogleContainerTools/distroless for more details -FROM --platform=$BUILDPLATFORM gcr.io/distroless/static:nonroot +FROM --platform=$BUILDPLATFORM gcr.io/distroless/static:nonroot@sha256:55c636171053dbc8ae07a280023bd787d2921f10e569f3e319f1539076dbba11 WORKDIR / COPY --from=builder /workspace/manager . USER 65532:65532 diff --git a/docker/presets/inference/llama-2/Dockerfile b/docker/presets/inference/llama-2/Dockerfile index 285cb122a..822119736 100644 --- a/docker/presets/inference/llama-2/Dockerfile +++ b/docker/presets/inference/llama-2/Dockerfile @@ -4,7 +4,7 @@ # --build-arg VERSION={{VERSION}} \ # --build-arg MODEL_TYPE={{MODEL_TYPE}} \ -FROM python:3.8-slim +FROM python:3.8-slim@sha256:95bfecec648356cdd0b28c8b00ce00009baff10c99d1126a82d1aca716453a1a WORKDIR /workspace # Install git diff --git a/docker/presets/inference/tfs/Dockerfile b/docker/presets/inference/tfs/Dockerfile index 5a322b8bd..e34309cb3 100644 --- a/docker/presets/inference/tfs/Dockerfile +++ b/docker/presets/inference/tfs/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.10-slim +FROM python:3.10-slim@sha256:684b1aaf96a7942b3c3af438d162e0baa3510aa7af25ad76d238e0c746bdec79 ARG WEIGHTS_PATH ARG MODEL_TYPE diff --git a/docker/presets/tuning/Dockerfile b/docker/presets/tuning/Dockerfile index 896deb85a..5a9a2d624 100644 --- a/docker/presets/tuning/Dockerfile +++ b/docker/presets/tuning/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.10-slim +FROM python:3.10-slim@sha256:684b1aaf96a7942b3c3af438d162e0baa3510aa7af25ad76d238e0c746bdec79 ARG WEIGHTS_PATH ARG MODEL_TYPE