#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
RAPIDS_MG_TOOLS_DIR=${RAPIDS_MG_TOOLS_DIR:-$(cd $(dirname $0); pwd)}
source ${RAPIDS_MG_TOOLS_DIR}/script-env.sh
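# NOTE: the variables used below (GPUS_PER_NODE, SCRIPTS_DIR, DATASETS_DIR,
# BENCHMARK_DIR, BENCHMARK_RESULTS_DIR, SCHEDULER_FILE, WORKER_RMM_POOL_SIZE)
# and the logger/setTee/unsetTee/activateCondaEnv helpers are assumed to be
# provided by script-env.sh or the calling environment (e.g. cronjob.sh).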
# FIXME: this is project-specific and should happen at the project level.
module load cuda/11.2.2.0
activateCondaEnv
# FIXME: enforce 1st arg is present
NUM_GPUS=$1
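# One possible way to enforce the required first arg (sketch only, not enabled):
#   if [[ -z "$1" ]]; then
#       echo "ERROR: the number of GPUs must be passed as the first arg, e.g. \"$0 8\"" >&2
#       exit 1
#   fi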
NUM_NODES=$(python -c "from math import ceil;print(int(ceil($NUM_GPUS/float($GPUS_PER_NODE))))")
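# For example (illustrative), with GPUS_PER_NODE=8: NUM_GPUS=8 -> NUM_NODES=1,
# NUM_GPUS=16 -> NUM_NODES=2, NUM_GPUS=12 -> NUM_NODES=2 (partial nodes round up).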
# Creates a string "0,1,2,3" if NUM_GPUS=4, for example, which can be
# used for setting CUDA_VISIBLE_DEVICES on single-node runs.
ALL_GPU_IDS=$(python -c "print(\",\".join([str(n) for n in range($NUM_GPUS)]))")
SCALES=("9" "10" "11")
ALGOS=(bfs sssp pagerank wcc louvain katz)
#ALGOS=(bfs)
SYMMETRIZED_ALGOS=(sssp wcc louvain)
WEIGHTED_ALGOS=(sssp)
scales_array=${SCALES[((NUM_NODES/2))]}
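# With the SCALES above, NUM_NODES/2 selects the scale: 1 node -> scale 9,
# 2-3 nodes -> scale 10, 4-5 nodes -> scale 11 (more nodes than that would
# index past the end of SCALES).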
# NOTE: it's assumed BENCHMARK_DIR has been created elsewhere! For
# example, cronjob.sh calls this script multiple times in parallel, so
# it will create, populate, etc. BENCHMARK_DIR once ahead of time.
export CUPY_CACHE_DIR=${BENCHMARK_DIR}  # change this after removing the cugraph-benchmark directory
# Function for running a command that gets killed after a specific timeout and
# logs a timeout message. This also sets ERRORCODE appropriately.
LAST_EXITCODE=0
function handleTimeout {
    seconds=$1
    eval "timeout --signal=2 --kill-after=60 $*"
    LAST_EXITCODE=$?
    if (( $LAST_EXITCODE == 124 )); then
        logger "ERROR: command timed out after ${seconds} seconds"
    elif (( $LAST_EXITCODE == 137 )); then
        logger "ERROR: command timed out after ${seconds} seconds, and had to be killed with signal 9"
    fi
    ERRORCODE=$((ERRORCODE | ${LAST_EXITCODE}))
}
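# Example (illustrative): "handleTimeout 120 python wait_for_workers.py ..." runs
#   timeout --signal=2 --kill-after=60 120 python wait_for_workers.py ...
# i.e. the first argument is both logged as ${seconds} and reused by timeout as
# the duration, since $* still includes it.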
# set +e so the script continues to execute commands even if they return
# non-0. This is needed so all benchmark commands run, but also means the exit code
# for this script must be managed separately in order to indicate that ALL benchmark
# commands passed vs. just the last one.
set +e
set -o pipefail
ERRORCODE=0
RUN_DASK_CLUSTER_PID=""
########################################
cd $BENCHMARK_DIR
# BENCHMARK_DIR is the benchmark directory created in the workspace (see NOTE above).
# RAPIDS_DATASET_ROOT_DIR is cugraph-specific.
export RAPIDS_DATASET_ROOT_DIR=$DATASETS_DIR
for algo in ${ALGOS[*]}; do
    for scale in ${scales_array[*]}; do
        # Create a log dir per benchmark file per configuration. This will
        # contain all dask scheduler/worker logs, the stdout/stderr of the
        # benchmark run itself, and any reports (XML, etc.) from the benchmark run
        # for the benchmark file. Export this var so scripts called below will
        # pick it up.
        RELATIVE_LOGS_DIR="${algo}_scale${scale}_num_nodes${NUM_NODES}/${NUM_GPUS}-GPUs"
        export LOGS_DIR="${BENCHMARK_RESULTS_DIR}/${RELATIVE_LOGS_DIR}"
        mkdir -p $LOGS_DIR
        setTee ${LOGS_DIR}/benchmark_output_log.txt
        echo -e "\n>>>>>>>> RUNNING BENCHMARK: $algo - ${NUM_GPUS}-GPUs <<<<<<<<"
        echo -e "\n>>>>>>>>>>>Scale: $scale"
        DASK_STARTUP_ERRORCODE=0
        if [[ $NUM_NODES -gt 1 ]]; then
            export UCX_MAX_RNDV_RAILS=1
            # python benchmarks will look for env var SCHEDULER_FILE when
            # determining what type of Dask cluster to create, so export
            # it here for subprocesses to see.
            export SCHEDULER_FILE=$SCHEDULER_FILE
            # srun runs a task per node by default
            uniqueJobName=$(uuidgen | cut -d'-' -f1)
            srun --export="ALL,SCRIPTS_DIR=$SCRIPTS_DIR" --job-name=$uniqueJobName --partition=batch --output=/dev/null ${SCRIPTS_DIR}/run-cluster-dask-jobs.sh &
            RUN_DASK_CLUSTER_PID=$!
            handleTimeout 120 python ${SCRIPTS_DIR}/wait_for_workers.py --num-expected-workers=$NUM_GPUS --scheduler-file-path=$SCHEDULER_FILE
            DASK_STARTUP_ERRORCODE=$LAST_EXITCODE
        else
            export CUDA_VISIBLE_DEVICES=$ALL_GPU_IDS
            logger "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
        fi
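        # Flag selection below: algos listed in SYMMETRIZED_ALGOS get
        # --symmetric-graph, algos not in WEIGHTED_ALGOS get --unweighted, and
        # multi-node runs pass --dask-scheduler-file while single-node runs
        # pass --rmm-pool-size.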
        if [[ $DASK_STARTUP_ERRORCODE == 0 ]]; then
            logger "RUNNING benchmark for algo $algo"
            if echo ${SYMMETRIZED_ALGOS[*]} | grep -q -w "$algo"; then
                if echo ${WEIGHTED_ALGOS[*]} | grep -q -w "$algo"; then
                    if [[ $NUM_NODES -gt 1 ]]; then
                        handleTimeout 600 python ${BENCHMARK_DIR}/python_e2e/main.py --algo=$algo --scale=$scale --symmetric-graph --dask-scheduler-file=$SCHEDULER_FILE --benchmark-dir=$BENCHMARK_RESULTS_DIR
                    else
                        handleTimeout 600 python ${BENCHMARK_DIR}/python_e2e/main.py --algo=$algo --scale=$scale --symmetric-graph --benchmark-dir=$BENCHMARK_RESULTS_DIR --rmm-pool-size=$WORKER_RMM_POOL_SIZE
                    fi
                else
                    if [[ $NUM_NODES -gt 1 ]]; then
                        handleTimeout 600 python ${BENCHMARK_DIR}/python_e2e/main.py --algo=$algo --scale=$scale --symmetric-graph --unweighted --dask-scheduler-file=$SCHEDULER_FILE --benchmark-dir=$BENCHMARK_RESULTS_DIR
                    else
                        handleTimeout 600 python ${BENCHMARK_DIR}/python_e2e/main.py --algo=$algo --scale=$scale --symmetric-graph --unweighted --benchmark-dir=$BENCHMARK_RESULTS_DIR --rmm-pool-size=$WORKER_RMM_POOL_SIZE
                    fi
                fi
            else
                if echo ${WEIGHTED_ALGOS[*]} | grep -q -w "$algo"; then
                    if [[ $NUM_NODES -gt 1 ]]; then
                        handleTimeout 600 python ${BENCHMARK_DIR}/python_e2e/main.py --algo=$algo --scale=$scale --dask-scheduler-file=$SCHEDULER_FILE --benchmark-dir=$BENCHMARK_RESULTS_DIR
                    else
                        handleTimeout 600 python ${BENCHMARK_DIR}/python_e2e/main.py --algo=$algo --scale=$scale --benchmark-dir=$BENCHMARK_RESULTS_DIR --rmm-pool-size=$WORKER_RMM_POOL_SIZE
                    fi
                else
                    if [[ $NUM_NODES -gt 1 ]]; then
                        handleTimeout 600 python ${BENCHMARK_DIR}/python_e2e/main.py --algo=$algo --scale=$scale --unweighted --dask-scheduler-file=$SCHEDULER_FILE --benchmark-dir=$BENCHMARK_RESULTS_DIR
                    else
                        handleTimeout 600 python ${BENCHMARK_DIR}/python_e2e/main.py --algo=$algo --scale=$scale --unweighted --benchmark-dir=$BENCHMARK_RESULTS_DIR --rmm-pool-size=$WORKER_RMM_POOL_SIZE
                    fi
                fi
            fi
            BENCHMARK_ERRORCODE=$LAST_EXITCODE
        else
            # Record the startup failure so the per-benchmark report below does
            # not reuse a stale exit code from a previous iteration.
            BENCHMARK_ERRORCODE=$DASK_STARTUP_ERRORCODE
            logger "Dask processes failed to start, not running benchmarks for $algo."
        fi
        if [[ $NUM_NODES -gt 1 ]]; then
            # Killing the script running all Dask processes on all nodes
            # (scheduler, all workers) will stop those processes. The nodes
            # running those processes will still be allocated to this job,
            # and can/will be used to run the same Dask processes again
            # for the next benchmark.
            kill $RUN_DASK_CLUSTER_PID
        else
            logger "stopping any remaining dask/python processes"
            pkill dask
            pkill python
            pgrep -la dask
            pgrep -la python
        fi
        if [[ $DASK_STARTUP_ERRORCODE == 0 ]]; then
            logger "python exited with code: $BENCHMARK_ERRORCODE, run-nightly-benchmarks.sh overall exit code is: $ERRORCODE"
        fi
        unsetTee
        # Generate a crude report containing the status of each benchmark file.
        benchmark_status_string=PASSED
        if [[ $BENCHMARK_ERRORCODE != 0 ]]; then
            benchmark_status_string=FAILED
        fi
        echo "Benchmarking $algo $benchmark_status_string ./${RELATIVE_LOGS_DIR}" >> ${BENCHMARK_RESULTS_DIR}/benchmark-results-${NUM_GPUS}-GPUs.txt
        sleep 2
    done
done
logger "Exiting \"run-nightly-benchmarks.sh $NUM_GPUS\" with $ERRORCODE"
exit $ERRORCODE