-
Notifications
You must be signed in to change notification settings - Fork 98
/
Copy pathpower_ipmi
executable file
·149 lines (134 loc) · 4.34 KB
/
power_ipmi
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env bash
# Suspend/resume IPMI-based Slurm nodes using FreeIPMI tools from https://www.gnu.org/software/freeipmi/
# Author: [email protected]
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/
# Use with ResumeProgram and SuspendProgram in slurm.conf
# NOTE: The slurmctld will execute this script as user "slurm"
# (see https://slurm.schedmd.com/power_save.html)
# so the slurm user must have credentials for suspending and resuming nodes.
# MODIFY THIS:
# Add lines like the following (uncommented) to the ~/.bashrc file of the Slurm user so that the IPMI credentials are exported:
# export IPMI_USER=root
# export IPMI_PASSWORD=verysecretpassword
# Suffix appended to a node name to form the DNS name of its BMC.
# Example: with suffix "b" the BMC of node c190 is reached as c190b.
BMC_SUFFIX=b
# File receiving the log of IPMI suspend/resume actions.
# NOTE: it must be writable by SlurmUser.
LOGFILE="/var/log/slurm/power_ipmi.log"
# Name of the Slurm user (who must own $LOGFILE), taken from the
# "SlurmUser = name(uid)" line of the running configuration.
slurmuser="$(scontrol show config | awk '/SlurmUser/ { split($3, parts, "("); print parts[1] }')"
# Prerequisites:
# * Install this RPM package: yum install freeipmi
# * We require the "nodeset" command from the ClusterShell package. Install it by:
# yum install epel-release
# yum install clustershell
# Command usage:
#######################################
# Print the command usage to stdout.
# The synopsis lists every option accepted by the getopts string
# "rsfcRqh", including -c and -R.
# Globals:   none
# Arguments: none
# Outputs:   the help text on stdout
#######################################
usage() {
  cat <<EOF
Usage: $0 [-r|-s|-f|-c|-R|-q|-h] nodelist
where the action is:
-r: Resume (start) nodes in nodelist
-s: Suspend (stop) nodes in nodelist
-f: FORCE power-off nodes in nodelist
-c: Power cycle nodes in nodelist
-R: Reset nodes in nodelist
-q: Query power status of nodes in nodelist
-h: Print this help information
EOF
}
# Translate the command line option into the ipmipower action.
#   action  - option string handed to ipmipower ("" when no option was given)
#   logging - 1 by default; set to 0 for the status query (-q)
action=""
logging=1
while getopts "rsfcRqh" options; do
  case "$options" in
    r) action="--on --on-if-off" ;;
    s) action="--soft --wait-until-off" ;;
    f) action="--off" ;;
    c) action="--cycle" ;;
    R) action="--reset" ;;
    q)
      action="--stat"
      logging=0
      ;;
    h)
      usage
      exit 0
      ;;
    *)
      # getopts reports an unknown option as "?".  It must not share a case
      # arm with -h: an unquoted ? is a glob that matches any single
      # character, which made invalid options exit with status 0.
      usage
      exit 1
      ;;
  esac
done
# Drop the parsed options so that $1 is the nodelist
shift $((OPTIND-1))
# Exactly one positional argument, the Slurm nodelist, must be left
# after the options have been removed
if [[ $# -ne 1 ]]; then
  echo "ERROR: No Slurm nodelist has been given"
  usage
  exit 1
fi
# List of nodenames and BMC DNS names.
# The nodelist is quoted everywhere: a bracketed nodeset such as c[001-010]
# is also a valid shell glob and would otherwise be replaced by the name of
# any matching file in the current directory.
nodelist=$1
nodecount=$(nodeset --count "$nodelist")
# Append the BMC's DNS name suffix BMC_SUFFIX to the nodes' DNS names
# using the ClusterShell command "nodeset"
bmclong=$(nodeset -O "%s${BMC_SUFFIX}" --expand "$nodelist")
# Word splitting is intended here: every expanded BMC name is passed to
# nodeset as a separate argument.
# shellcheck disable=SC2086
bmclist=$(nodeset --fold $bmclong)
# Load the IPMI credentials by sourcing ~/.bashrc, which should export
# variables like:
# export IPMI_USER=root
# export IPMI_PASSWORD=verysecretpassword
# Note: The environment variables set by slurmctld do NOT include PATH, USER, HOME etc.
source ~/.bashrc
USER=$(whoami)
# Prepend the path where FreeIPMI tools live
export PATH=/usr/sbin:$PATH
# Both credentials are required; report a missing one on stdout and in the
# logfile.  $LOGFILE is quoted so the path is always passed to tee as a
# single argument.
if [[ -z "$IPMI_USER" || -z "$IPMI_PASSWORD" ]]; then
  echo "ERROR: The user IPMI_USER and/or password IPMI_PASSWORD have not been set in ~/.bashrc" | tee --append "$LOGFILE"
  exit 1
fi
# An action option is mandatory.  For every action that has logging enabled
# the remaining output of the script is redirected to $LOGFILE.
# All expansions are quoted so that paths and names are never word-split
# or globbed.
if [[ -z "$action" ]]; then
  echo "ERROR: No action has been given" | tee --append "$LOGFILE"
  usage
  exit 1
elif [[ $logging -eq 1 ]]; then
  # The case where we want logging to go to $LOGFILE
  # Make sure the LOGFILE is owned by SlurmUser and has correct permissions
  touch "$LOGFILE"
  chown "$slurmuser:" "$LOGFILE"
  chmod 644 "$LOGFILE"
  # Do the resume or suspend action:
  # From here on stdout and stderr are appended to $LOGFILE
  exec &>> "$LOGFILE"
  DATE=$(date +'%b %d %T')
  echo "$DATE Invoked $0 by $(id "$USER")"
  # Display $action in UPPER case (see the bash man-page under Case modification)
  echo "$DATE POWER ${action^^} the IPMI based nodelist $nodelist ($nodecount nodes)"
fi
#
# Use the FreeIPMI command "ipmipower" to power nodes on or off or get status
#
# Specify the IPMI 2.0 cipher suite ID to use:
# HPE and SuperMicro BMC only support "-I 3"
cipher=(-I 17)
# Cipher options of the second attempt; an empty list passes no -I option
# fallbackcipher=(-I 3)
fallbackcipher=()
driver=(-D LAN_2_0)
timeout="--session-timeout=1000" # Set IPMI timeout to 1000 milliseconds
consolidate="--consolidate-output" # Consolidated node output

# $action may hold several options (for example "--on --on-if-off"), so it is
# split on whitespace into separate arguments.
read -r -a action_args <<< "$action"

#######################################
# Run ipmipower against $bmclist with the requested action.
# Every value is passed as its own quoted argument so that a user name,
# password or host list containing whitespace or glob characters reaches
# ipmipower unchanged.
# NOTE(review): the password is passed on the command line and is therefore
# visible in the process list; consider a FreeIPMI configuration file.
# Globals:   driver, IPMI_USER, IPMI_PASSWORD, timeout, consolidate,
#            bmclist, action_args (all read)
# Arguments: the cipher suite options for this attempt (may be none)
# Returns:   the exit status of ipmipower
#######################################
run_ipmipower() {
  ipmipower "${driver[@]}" "$@" \
    "--username=$IPMI_USER" "--password=$IPMI_PASSWORD" \
    "$timeout" "$consolidate" "--hostname=$bmclist" "${action_args[@]}"
}

# First try the preferred cipher suite, showing its stdout and stderr together.
# If that attempt fails, retry with the fallback cipher options.  No temporary
# file is needed, so nothing is left behind when the script is interrupted.
if ! run_ipmipower "${cipher[@]}" 2>&1; then
  # NOTE(review): the script has always finished with status 0 whatever
  # ipmipower returned; that is kept here so callers see no change.
  # Consider propagating the status instead.
  run_ipmipower "${fallbackcipher[@]}" || true
fi