Skip to content

Commit 59cda92

Browse files
author
Martijn Kruiten
committed
Node mark reboot helper
1 parent e821cef commit 59cda92

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

helpers/node-mark-reboot

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/bin/bash
#
# SURFsara Node Health Check -- Node Rebooting Helper
#
# Martijn Kruiten <[email protected]>
# 26 apr 2018
#

# This script is a simple wrapper that the node health check can run
# in the background to mark nodes for reboot.  It will first obtain
# the current node state information to avoid rebooting a node that
# is already offline or in maintenance.  If these checks pass, the
# node is marked for reboot.
#
# Usage: node-mark-reboot <hostname> [note ...]
#
# Environment:
#   NHC_RM                 Resource manager in use; only "slurm" is handled.
#   SLURM_SINFO            Command used to query node state (default: sinfo).
#   SLURM_SCONTROL         Command used to change node state (default: scontrol).
#   SLURM_SC_OFFLINE_ARGS  Arguments given to SLURM_SCONTROL ahead of the
#                          hostname (default: "reboot ASAP").
#
# Exit status:
#   0    Node handed to the resource manager, or deliberately left alone.
#   255  No hostname given, or unsupported resource manager.
#   Otherwise the status of the exec'd SLURM_SCONTROL command.

# NOTE(review): IGNORE_EMPTY_NOTE is defaulted here but never consulted by
# this script; kept so the variable remains defined as before.
IGNORE_EMPTY_NOTE="${IGNORE_EMPTY_NOTE:-0}"
LEADER="NHC:"

# Log the invocation (timestamp, script path, raw arguments).
echo "$(date '+%Y%m%d %H:%M:%S') $0 $*"

# Without a hostname there is nothing to query or reboot; bail out before
# sinfo/scontrol are invoked with an empty node list.
if [[ -z "${1:-}" ]]; then
    echo "$0: Usage: $0 <hostname> [note ...]"
    exit 255
fi

HOSTNAME="$1"
shift
# Everything after the hostname is the free-form note; it is only logged,
# it is not passed on to the resource manager.
NOTE="$*"

### SLURM
if [[ "$NHC_RM" == "slurm" ]]; then
    SLURM_SINFO="${SLURM_SINFO:-sinfo}"
    SLURM_SCONTROL="${SLURM_SCONTROL:-scontrol}"
    SLURM_SC_OFFLINE_ARGS="${SLURM_SC_OFFLINE_ARGS:-reboot ASAP}"

    # Split the configurable command strings on whitespace (so values such
    # as "reboot ASAP" or a command with embedded options keep working)
    # without subjecting them to pathname expansion.
    read -r -a SINFO_CMD <<< "$SLURM_SINFO"
    read -r -a SCONTROL_CMD <<< "$SLURM_SCONTROL $SLURM_SC_OFFLINE_ARGS"

    # The first field of the first output line is the compact node state.
    # It is parsed with `read` rather than an unquoted expansion because the
    # state can carry glob characters (e.g. "idle*"), which would otherwise
    # be expanded against files in the current directory.
    SINFO_OUT="$("${SINFO_CMD[@]}" -o '%t %E' -hn "$HOSTNAME")"
    read -r STATUS _ <<< "$SINFO_OUT"

    case "$STATUS" in
        alloc*|comp*|idle*|mix*|resume*|resv*|undrain*)
            echo "$0: Marking $STATUS $HOSTNAME for reboot: $LEADER $NOTE"
            # exec replaces this shell, so the exit status seen by the
            # caller is that of the scontrol command.
            exec "${SCONTROL_CMD[@]}" "$HOSTNAME"
            ;;
        down*|drain*|drng*|fail*|maint*)
            echo "$0: Not changing state of down node $HOSTNAME."
            ;;
        *)
            # Also reached when sinfo produced no output (empty state).
            echo "$0: Not sure how to handle node state \"$STATUS\" on $HOSTNAME"
            ;;
    esac

### Everything else is unsupported.
else
    echo "$0: Unsupported RM detected in $0: \"$NHC_RM\""
    # The original used `exit -1`, which is outside the valid 0-255 range;
    # bash reports it as 255, so use that value explicitly.
    exit 255
fi
exit 0

0 commit comments

Comments
 (0)