forked from OleHolmNielsen/Slurm_tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalive
executable file
·44 lines (38 loc) · 910 Bytes
/
alive
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/bin/bash
#
# Check all nodes in a Slurm cluster by pinging each one
#
# Author: Ole Holm Nielsen, [email protected]
#
# For every node that sinfo lists in state down, fail or unknown, one ping is
# sent.  A per-node stop file in $STOPFILEDIR records that the node has already
# been reported as unreachable, so each state change is printed only once:
#   - ping fails and no stop file exists : report the node, create the file
#   - ping succeeds and a stop file exists: report recovery, remove the file
# Nothing is printed while a node's reachability is unchanged.

# Exit the script in case of control-C etc.
# 255 is the status bash produced for the former "exit -1"; an exit status
# must lie in the range 0-255, so it is now spelled out explicitly.
trap 'exit 255' SIGTERM SIGINT SIGQUIT

# Commands are stored as arrays so that their options remain separate words
# without depending on unquoted expansion.
PING=(/bin/ping -c 1 -w 3)

# The Slurm node list: only nodes in state down, fail or unknown are checked.
# Comment: Nodes that are powering_up/down or boot^ should not be listed
SINFO=(/usr/bin/sinfo -N -h -t down,fail,unknown)

# Directory holding one "alive.<node>" stop file per node already reported.
STOPFILEDIR=/var/lib/alive

# Without the stop file directory every later touch would fail and each
# unreachable node would be reported again on every run, so give up early.
if [[ ! -d "$STOPFILEDIR" ]]; then
  if ! mkdir -p -v "$STOPFILEDIR"; then
    echo "Cannot create stop file directory $STOPFILEDIR" >&2
    exit 1
  fi
fi

# Collect the unique node names.  The sinfo output is split on whitespace
# (which also drops any column padding) into an array; unlike an unquoted
# "for node in $list" this performs no pathname expansion.
# read returns non-zero at end of input with -d ''; that is expected here.
node_names=$("${SINFO[@]}" -O NodeList | sort -u)
NODELIST=()
read -r -d '' -a NODELIST <<<"$node_names"

for node in "${NODELIST[@]}"; do
  STOPFILE="$STOPFILEDIR/alive.${node}"
  if "${PING[@]}" "$node" >/dev/null 2>&1; then
    # Node answers: only mention it if it was previously reported as down.
    if [[ -f "$STOPFILE" ]]; then
      echo "Host ${node} is up again"
      "${SINFO[@]}" -n "$node"
      rm -f -- "$STOPFILE"
    fi
  else
    # Node does not answer: report it once, then stay quiet until it recovers.
    if [[ ! -f "$STOPFILE" ]]; then
      echo "Cannot ping host ${node} !"
      "${SINFO[@]}" -n "$node"
      touch -- "$STOPFILE"
    fi
  fi
done