-
Notifications
You must be signed in to change notification settings - Fork 68
104 lines (100 loc) · 3.42 KB
/
correctness.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Correctness test workflow: can be started by hand or by the daily schedule.
name: Correctness tests
on:
  # Manual trigger. The optional `djl-version` input is handed to the test
  # step as the TEST_DJL_VERSION environment variable; it is empty by default
  # (and on scheduled runs, which carry no inputs).
  workflow_dispatch:
    inputs:
      djl-version:
        description: 'The released version of DJL'
        required: false
        default: ''
  # Every day at 09:00 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: '0 9 * * *'
jobs:
  # Runs on the scheduler host and starts the GPU instance used by `test`.
  create-runners-p4d:
    runs-on: [self-hosted, scheduler]
    steps:
      - name: Create new P4d.24xl instance
        id: create_gpu_p4d
        # Explicit `shell: bash` makes GitHub run the script with
        # `-eo pipefail`, so a failing `curl --fail` aborts the step instead
        # of being masked by the exit status of the last pipeline stage.
        shell: bash
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          # Request a runner registration token for the repository.
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
            https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
            --fail \
            | jq -r '.token' )
          # NOTE(review): start_instance.sh is not part of this file; it is
          # expected to set the `action_lmic_p4d_instance_id` step output
          # that the job output below reads - confirm against the script.
          ./start_instance.sh action_lmic_p4d "$token" djl-serving
    outputs:
      p4d_instance_id: ${{ steps.create_gpu_p4d.outputs.action_lmic_p4d_instance_id }}

  # Runs each matrix entry on the runner label given by its `instance` field.
  test:
    runs-on: [ "${{ matrix.test.instance }}" ]
    timeout-minutes: 90
    needs: create-runners-p4d
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        test:
          - test: TestCorrectnessLmiDist
            instance: p4d
    steps:
      - uses: actions/checkout@v4
      - name: Clean env
        # Deliberately left on the default shell (no pipefail): `yes` is
        # terminated by a closed pipe once `docker system prune` exits.
        run: |
          yes | docker system prune -a --volumes
          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
          echo "wait dpkg lock..."
          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
      - name: Set up JDK 17
        uses: actions/setup-java@v4
        with:
          distribution: 'corretto'
          java-version: '17'
      - name: Set up Python3
        uses: actions/setup-python@v5
        with:
          python-version: '3.10.x'
      - name: Install pip dependencies
        run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
      - name: Install torch
        # Use torch to get cuda capability of current device to selectively run tests
        # Torch version doesn't really matter that much
        run: |
          pip3 install torch==2.3.0
      - name: Install awscurl
        working-directory: tests/integration
        run: |
          curl -OL https://publish.djl.ai/awscurl/awscurl
          chmod +x awscurl
          # -p: do not fail if the directory is already present.
          mkdir -p outputs
      - name: Test
        working-directory: tests/integration
        env:
          TEST_DJL_VERSION: ${{ inputs.djl-version }}
        run: |
          python -m pytest -k ${{ matrix.test.test }} tests.py
      # Runs only when every previous step succeeded (default step condition).
      - name: Cleanup
        working-directory: tests/integration
        run: |
          rm -rf outputs
          rm -f awscurl
      # Runs only when an earlier step failed: dump outputs, then clean up.
      - name: On Failure
        if: ${{ failure() }}
        working-directory: tests/integration
        run: |
          for file in outputs/*; do
            if [ -f "$file" ]; then
              echo "Contents of $file:"
              cat "$file"
              echo
            fi
          done
          sudo rm -rf outputs && sudo rm -rf models
          # -f: the failure may have happened before awscurl was downloaded;
          # a plain `rm` would abort here and skip the container removal.
          rm -f awscurl
          docker rm -f $(docker ps -aq) || true
      - name: Upload test logs
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: test-${{ matrix.test.test }}-logs
          path: tests/integration/all_logs/

  # `always()` keeps this job running even when the jobs it needs failed or
  # were cancelled, so the instance is not left running.
  stop-runners-p4d:
    if: always()
    runs-on: [ self-hosted, scheduler ]
    needs: [ create-runners-p4d, test ]
    steps:
      - name: Stop all instances
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          # NOTE(review): this is empty when create-runners-p4d failed before
          # setting its output; stop_instance.sh is then called without an
          # argument - confirm the script handles that case.
          instance_id=${{ needs.create-runners-p4d.outputs.p4d_instance_id }}
          ./stop_instance.sh $instance_id