# e2e-preset-tuning-test.yml
name: E2E Preset tuning Test

# Triggers:
#   - workflow_run: fires whenever "Build and Push Preset Models" completes
#     (regardless of result; the job-level `if` filters on its conclusion).
#   - workflow_dispatch: manual run with no inputs.
on:
  workflow_run:
    workflows: ["Build and Push Preset Models"]
    types:
      - completed
  workflow_dispatch: {}

env:
  # Quoted so the version stays a string rather than being parsed as a float.
  # NOTE(review): no step in this workflow reads GO_VERSION - confirm whether
  # it is still needed.
  GO_VERSION: "1.22"

# Minimal token scopes: `id-token: write` allows the job to request an OIDC
# token (the Azure login step is given only a client id and tenant id, no
# secret), and `contents: read` is needed to check out the repository.
permissions:
  id-token: write
  contents: read
jobs:
  # End-to-end check of the preset tuning image: provision a GPU node pool on
  # the shared AKS cluster, run the tuning Job manifest, wait for it to finish,
  # then always tear everything down again.
  e2e-preset-tuning-tests:
    # Manual runs always proceed; workflow_run-triggered runs proceed only when
    # the upstream build succeeded.
    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
    runs-on: ubuntu-latest
    environment: preset-env
    steps:
      - name: Checkout
        # NOTE(review): the pinned version was obfuscated in the copy under
        # review (rendered as "[email protected]"). The major tag is used
        # here; re-pin to the exact release (or commit SHA) the repo intends.
        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0

      - name: 'Az CLI login'
        # NOTE(review): version was obfuscated in the copy under review, same
        # as the checkout step; re-pin to the intended release.
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          allow-no-subscriptions: true

      - name: 'Set ACR Subscription'
        env:
          AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        run: az account set --subscription "$AZURE_SUBSCRIPTION_ID"

      - name: Set up kubectl context
        run: |
          az aks get-credentials --resource-group llm-test --name GitRunner

      # Publishes two step outputs:
      #   TAG   - image tag substituted into the Job manifest
      #   model - the "tuning-example" entry of e2e-preset-configs.json as
      #           single-line JSON (step outputs are plain strings, so
      #           consumers must parse it themselves)
      - name: Get test meta
        id: get_test_meta
        run: |
          # Install jq only when it is missing; probing first means the step
          # does not abort on a runner image without jq.
          if ! command -v jq >/dev/null 2>&1; then
            sudo apt-get update
            sudo apt-get install -y jq
          fi
          jq --version

          # -c emits compact one-line JSON: a pretty-printed multi-line value
          # would corrupt the key=value format of $GITHUB_OUTPUT.
          # first(...) guarantees at most one object even if names repeat.
          CONFIG=$(jq -c 'first(.matrix.image[] | select(.name == "tuning-example"))' .github/e2e-preset-configs.json)
          if [ -z "$CONFIG" ]; then
            echo "No \"tuning-example\" entry found in .github/e2e-preset-configs.json"
            exit 1
          fi

          echo "TAG=0.0.7" >> "$GITHUB_OUTPUT"
          echo "model=$CONFIG" >> "$GITHUB_OUTPUT"

      - name: Create Nodepool
        env:
          MODEL_JSON: ${{ steps.get_test_meta.outputs.model }}
        run: |
          # The model output is a JSON string; expression property access such
          # as `outputs.model.name` yields nothing, so parse it here instead.
          # -e makes jq fail (and the step with it) when a field is missing.
          NODEPOOL_NAME=$(jq -er '.name' <<<"$MODEL_JSON")
          NODE_COUNT=$(jq -er '."node-count"' <<<"$MODEL_JSON")
          NODE_VM_SIZE=$(jq -er '."node-vm-size"' <<<"$MODEL_JSON")
          NODE_OSDISK_SIZE=$(jq -er '."node-osdisk-size"' <<<"$MODEL_JSON")

          # NOTE(review): the selected entry's name is "tuning-example". AKS
          # documents Linux node pool names as lowercase alphanumeric and at
          # most 12 characters - confirm this name is accepted, or add a
          # dedicated node pool name field to the config.

          # Empty result means the pool does not exist (the show call failed).
          NODEPOOL_EXIST=$(az aks nodepool show \
            --name "$NODEPOOL_NAME" \
            --cluster-name GitRunner \
            --resource-group llm-test \
            --query 'name' -o tsv || echo "")
          echo "NODEPOOL_EXIST: $NODEPOOL_EXIST"

          if [ -z "$NODEPOOL_EXIST" ]; then
            az aks nodepool add \
              --name "$NODEPOOL_NAME" \
              --cluster-name GitRunner \
              --resource-group llm-test \
              --node-count "$NODE_COUNT" \
              --node-vm-size "$NODE_VM_SIZE" \
              --node-osdisk-size "$NODE_OSDISK_SIZE" \
              --labels "pool=$NODEPOOL_NAME" \
              --node-taints sku=gpu:NoSchedule \
              --aks-custom-headers UseGPUDedicatedVHD=true
          else
            NODEPOOL_STATE=$(az aks nodepool show \
              --name "$NODEPOOL_NAME" \
              --cluster-name GitRunner \
              --resource-group llm-test \
              --query 'provisioningState' -o tsv)
            echo "NODEPOOL_STATE: $NODEPOOL_STATE"
            if [ "$NODEPOOL_STATE" != "Succeeded" ]; then
              echo "Nodepool exists but is not in a Succeeded state. Please check manually."
              exit 1
            else
              echo "Nodepool already exists and is in a running state."
            fi
          fi

      - name: Replace repo and Deploy Resource to K8s
        env:
          REPO: ${{ secrets.ACR_AMRT_USERNAME }}
          TAG: ${{ steps.get_test_meta.outputs.TAG }}
        run: |
          # Fill the REPO_HERE / TAG_HERE placeholders in the manifest in place.
          sed -i "s/REPO_HERE/${REPO}/g" presets/workspace/test/tuning/tuning-job.yaml
          sed -i "s/TAG_HERE/${TAG}/g" presets/workspace/test/tuning/tuning-job.yaml
          kubectl apply -f presets/workspace/test/tuning/tuning-job.yaml

      - name: Wait for tuning job to be ready
        # "bash {0}" drops the default -e, so a non-zero exit from
        # `kubectl wait` (condition not met yet) does not abort the loop.
        shell: bash {0}
        run: |
          retval_complete=1
          retval_failed=1
          count=0
          max_retries=60   # 60 polls x 10 s sleep = roughly 10 minutes

          # Poll until the Job reports Failed or Complete, or retries run out.
          # --timeout=0 makes each `kubectl wait` a single non-blocking check.
          while [[ $retval_complete -ne 0 ]] && [[ $retval_failed -ne 0 ]] && [[ $count -lt $max_retries ]]; do
            sleep 10
            kubectl wait --for=condition=failed job/tuning-example --timeout=0 >/dev/null 2>&1
            retval_failed=$?
            kubectl wait --for=condition=complete job/tuning-example --timeout=0 >/dev/null 2>&1
            retval_complete=$?
            count=$((count + 1))
          done

          if [ $retval_failed -eq 0 ]; then
            echo "Job failed. Please check logs."
            # Surface the logs before Cleanup deletes the Job.
            kubectl logs job/tuning-example --tail=200
            exit 1
          elif [ $retval_complete -ne 0 ]; then
            echo "Job timeout."
            kubectl logs job/tuning-example --tail=200
            exit 1
          else
            echo "Job succeeded."
          fi

      - name: Cleanup
        # Runs even when an earlier step failed so the node pool is not left
        # behind.
        if: always()
        env:
          MODEL_JSON: ${{ steps.get_test_meta.outputs.model }}
        run: |
          # Remember a failed delete instead of aborting, so the node pool
          # teardown below still runs; --ignore-not-found covers the case
          # where the manifest was never applied.
          delete_rc=0
          kubectl delete --ignore-not-found --wait=true \
            -f presets/workspace/test/tuning/tuning-job.yaml || delete_rc=$?

          # MODEL_JSON is empty when "Get test meta" did not complete.
          NODEPOOL_NAME=$(jq -r '.name // empty' <<<"${MODEL_JSON:-}" 2>/dev/null || true)
          if [ -z "$NODEPOOL_NAME" ]; then
            echo "Nodepool name unavailable; skipping nodepool cleanup."
            exit "$delete_rc"
          fi

          # Check and Delete AKS Nodepool if it exists
          NODEPOOL_EXIST=$(az aks nodepool show \
            --name "$NODEPOOL_NAME" \
            --cluster-name GitRunner \
            --resource-group llm-test \
            --query 'name' -o tsv || echo "")
          if [ -n "$NODEPOOL_EXIST" ]; then
            echo "deleting nodepool"
            az aks nodepool delete \
              --name "$NODEPOOL_NAME" \
              --cluster-name GitRunner \
              --resource-group llm-test
          fi

          # Report a failed Job deletion only after the pool has been handled.
          exit "$delete_rc"