-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_vdi.sh
executable file
·151 lines (115 loc) · 4.92 KB
/
run_vdi.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/bin/bash
#------------------------------------------------------------------------
#------------------------- define nci cluster size ----------------------
#------------------------------------------------------------------------
#------------------------------------------------------------------------
#------------------------- define task parameters -----------------------
#------------------------------------------------------------------------
MLHOME="/g/data/ge3/${USER}" # where you have installed venvs/MLWorkflow, etc
# Digit appended to each base port below and used as the prefix of the
# helper files this script writes into the working directory.
iteration=1
# The ports are built by string concatenation, so anything other than a
# single digit would produce an invalid port (e.g. 838 + 10 -> 83810).
if [[ ! ${iteration} =~ ^[0-9]$ ]]; then
  echo "iteration must be a single digit (0-9), got: '${iteration}'" >&2
  exit 1
fi
jupyterPort="838${iteration}"
rayDashboardPort="848${iteration}"
rayPort="637${iteration}"
# Configuration file handed to `python -m mlwkf -c` at the end of the script.
inputConfigFile=/g/data/ge3/axb562/github/My_MLFLOW/reference_configuration.ini
#------------------------------------------------------------------------
#------------------------- load nci modules -----------------------------
#------------------------------------------------------------------------
module purge
module load pbs
# module load python3-as-python
module load gdal/3.0.2
# NOTE(review): errexit is switched on only after the module commands, so a
# failed `module load` above does not abort the script - confirm intended.
set -e
ulimit -s unlimited
echo "A01: " "${MLHOME}"
echo "A02: " "${inputConfigFile}"
# source $MLHOME/venvs/MLWorkflow/bin/activate
source /g/data/cb01/anaconda3/bin/activate mlflow
cd "${MLHOME}/github/MLWorkflow"
#------------------------------------------------------------------------
#------------------------- setup ray cluster ----------------------------
#------------------------------------------------------------------------
echo "A03: " "${PWD}"
# CPU count passed to the head node's `ray start --num-cpus`.
ncpu=$(nproc)
echo "A04: " "${ncpu}"
# Whitespace-separated list of nodes to start Ray on; here only this host.
nodeDnsIps=$(hostname)
echo "A05: " "${nodeDnsIps}"
# Name of this host; a node in nodeDnsIps matching it becomes the Ray head.
hostNodeDnsIp=$(uname -n)
echo "A06: " "${hostNodeDnsIp}"
# Address that worker nodes use to reach the head.
hostNodeIp=$(hostname -i)
echo "A07: " "${hostNodeIp}"
echo "A08: " "${rayDashboardPort}"
# Passed to the worker script as its third argument.
# NOTE(review): hard-coded secret; this looks like Ray's default redis
# password - confirm, and consider reading it from the environment.
rayPassword='5241590000000000'
# Generate the script that is run over ssh on every non-head node. The
# delimiter is quoted, so nothing below is expanded until the generated
# script itself runs.
cat > "${PWD}/${iteration}_setupRayWorkerNode.sh" << 'EOF'
#!/bin/bash -l
# Joins this node to an existing Ray cluster as a worker.
# Arguments:
#   $1 - IP address of the Ray head node
#   $2 - port the Ray head node listens on
#   $3 - redis password of the Ray cluster
#   $4 - MLHOME directory containing github/MLWorkflow
set -e
ulimit -s unlimited
hostNodeIp=${1}
rayPort=${2}
rayPassword=${3}
MLHOME=${4}
hostIpNPort="${hostNodeIp}:${rayPort}"
module purge
module load pbs
# module load python3-as-python
module load gdal/3.0.2
# source $MLHOME/venvs/MLWorkflow/bin/activate
source /g/data/cb01/anaconda3/bin/activate mlflow
cd "${MLHOME}/github/MLWorkflow"
echo "running node to ray cluster"
echo "A12: " "$(uname -n)"
echo "A13: " "$(hostname -i)"
echo "A14: " "starting ray worker"
# Run in the foreground so output is streamed and a failed start is caught
# by errexit; --block keeps this script alive while the worker runs.
# NOTE(review): the worker CPU count is fixed at 16 - confirm it matches
# the worker nodes.
ray start --address="${hostIpNPort}" --num-cpus=16 --redis-password="${rayPassword}" --block
EOF
chmod +x "${PWD}/${iteration}_setupRayWorkerNode.sh"
echo "set up ray cluster......."
# Walk the node list: the entry matching this host becomes the Ray head,
# every other entry is started as a worker over ssh.
# nodeDnsIps is intentionally unquoted so it splits into one word per node.
for nodeDnsIp in ${nodeDnsIps}; do
  if [[ "${nodeDnsIp}" != "${hostNodeDnsIp}" ]]; then
    echo "Starting ray cluster on worker node ..."
    # Backgrounded so the loop can move on while the worker keeps running.
    ssh "${nodeDnsIp}" "${PWD}/${iteration}_setupRayWorkerNode.sh" "${hostNodeIp}" "${rayPort}" "${rayPassword}" "${MLHOME}" &
    sleep 5
    continue
  fi

  echo "Starting ray cluster on head node ..."
  module purge
  module load pbs
  # module load python3-as-python
  module load gdal/3.0.2
  # source $MLHOME/venvs/MLWorkflow/bin/activate
  source /g/data/cb01/anaconda3/bin/activate mlflow
  cd "${MLHOME}/github/MLWorkflow"
  ray start --head \
    --num-cpus="${ncpu}" \
    --include-dashboard=true \
    --dashboard-host=0.0.0.0 \
    --dashboard-port="${rayDashboardPort}" \
    --port="${rayPort}"
  # Pause before continuing with any remaining nodes.
  sleep 10
done
echo "Creating ray connection string ..."
# Start (truncate) the connection-strings file with the ssh tunnel command
# for the Ray dashboard port.
printf '%s\n' "ssh -N -L ${rayDashboardPort}:${hostNodeDnsIp}:${rayDashboardPort} ${USER}@gadi.nci.org.au &" > "${PWD}/${iteration}_connection_strings.txt"
#------------------------------------------------------------------------
#------------------------- setup jupyter notebook -----------------------
#------------------------------------------------------------------------
# Host name the notebook binds to and that the tunnel command below targets.
hostNodeDnsIp=$(uname -n)
echo "A15: " "${hostNodeDnsIp}"
echo "Starting Jupyter lab ..."
echo "A16: " "${jupyterPort}"
# SECURITY NOTE(review): token and password are both set to empty strings,
# so the notebook server is started without authentication on
# ${hostNodeDnsIp}:${jupyterPort}. Left unchanged here - confirm this is
# acceptable for the environment the script runs in.
# Backgrounded so the script can carry on to the workflow run.
jupyter notebook --no-browser --port "${jupyterPort}" --ip="${hostNodeDnsIp}" \
  --NotebookApp.token='' --NotebookApp.password='' &
echo "Creating jupyter connection string ..."
# Append the ssh tunnel command for the notebook port to the
# connection-strings file.
echo "ssh -N -L ${jupyterPort}:${hostNodeDnsIp}:${jupyterPort} ${USER}@gadi.nci.org.au &" >> "${PWD}/${iteration}_connection_strings.txt"
#------------------------------------------------------------------------
#------------------------- run ml workflow ------------------------------
#------------------------------------------------------------------------
# Capture the workflow's exit status instead of letting errexit abort the
# script here: otherwise a failed run would skip `ray stop` below and leave
# the Ray processes running.
workflowStatus=0
{ cd "${MLHOME}/github/MLWorkflow" && python -m mlwkf -c "${inputConfigFile}"; } || workflowStatus=$?
# sleep infinity # this allows the pbs nodes to persist until requested wall timeout, therefore you can run jupyter notebook and terminal in a browser
#------------------------------------------------------------------------
#------------------------- gracefully exit ------------------------------
#------------------------------------------------------------------------
# rm *setupRayWorkerNode.sh -f
# rm *connection_strings* -f
# rm mlflowpbs* -f
# rm core.ray:* -f
# rm core.raylet* -f
# rm core.store* -f
# Always shut Ray down, whether or not the workflow succeeded.
rayStopStatus=0
ray stop || rayStopStatus=$?
# Report the workflow's failure if it failed; otherwise report the status
# of `ray stop`.
if [[ ${workflowStatus} -ne 0 ]]; then
  exit "${workflowStatus}"
fi
exit "${rayStopStatus}"