Skip to content

Commit 7486112

Browse files
author
Martijn Kruiten
committed
Update to newest version that we use internally
1 parent 59cda92 commit 7486112

File tree

1 file changed

+29
-10
lines changed

1 file changed

+29
-10
lines changed

helpers/node-mark-reboot

Lines changed: 29 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -6,13 +6,13 @@
66
# 26 apr 2018
77
#
88

9-
# This script is a simple wrapper that the node health check can run
10-
# in the background to mark nodes for reboot. It will first obtain
11-
# the current node state information to avoid rebooting a node that
12-
# is already offline or in maintenance. If these checks pass, the
13-
# node is marked for reboot.
9+
# This script is a simple pbsnodes wrapper that the node health check
10+
# can run in the background to mark nodes for reboot. It will first
11+
# obtain the current node state information to avoid rebooting a node
12+
# that is already offline or in maintenance. If these checks pass,
13+
# the node is marked for reboot.
1414

15-
IGNORE_EMPTY_NOTE="${IGNORE_EMPTY_NOTE:-0}"
15+
FORCE_REBOOT="${FORCE_REBOOT:-0}"
1616
LEADER="NHC:"
1717

1818
echo "`date '+%Y%m%d %H:%M:%S'` $0 $*"
@@ -25,16 +25,34 @@ NOTE="$*"
2525
if [[ "$NHC_RM" == "slurm" ]]; then
2626
SLURM_SINFO="${SLURM_SINFO:-sinfo}"
2727
SLURM_SCONTROL="${SLURM_SCONTROL:-scontrol}"
28-
SLURM_SC_OFFLINE_ARGS="${SLURM_SC_OFFLINE_ARGS:-reboot ASAP}"
28+
SLURM_SC_REBOOT_ARGS="${SLURM_SC_REBOOT_ARGS:-reboot ASAP NextState=RESUME}"
2929

3030
LINE=( $($SLURM_SINFO -o '%t %E' -hn $HOSTNAME) )
3131
STATUS="${LINE[0]}"
32+
OLD_NOTE_LEADER="${LINE[1]}"
33+
OLD_NOTE="${LINE[*]:2}"
3234
case "$STATUS" in
33-
alloc*|comp*|idle*|mix*|resume*|resv*|undrain*)
35+
alloc*|comp*|drain*|drng*|fail*|idle*|maint*|mix*|resume*|resv*|undrain*)
36+
case "$STATUS" in
37+
drain*|drng*|fail*|maint*)
38+
# If the node is already offline, and we've not been told to ignore that,
39+
# do not touch the node.
40+
if [[ "$OLD_NOTE_LEADER" != "$LEADER" && "$FORCE_REBOOT" != "1" ]]; then
41+
echo "$0: Not rebooting $HOSTNAME: Already offline."
42+
exit 0
43+
fi
44+
;;
45+
esac
46+
# If there's an old note that wasn't set by NHC, preserve it.
47+
if [[ "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
48+
LEADER="$OLD_NOTE_LEADER"
49+
NOTE="$OLD_NOTE"
50+
SLURM_SC_REBOOT_ARGS="reboot ASAP NextState=DOWN"
51+
fi
3452
echo "$0: Marking $STATUS $HOSTNAME for reboot: $LEADER $NOTE"
35-
exec $SLURM_SCONTROL $SLURM_SC_OFFLINE_ARGS $HOSTNAME
53+
exec $SLURM_SCONTROL $SLURM_SC_REBOOT_ARGS Reason="$LEADER $NOTE" $HOSTNAME
3654
;;
37-
down*|drain*|drng*|fail*|maint*)
55+
down*)
3856
echo "$0: Not changing state of down node $HOSTNAME."
3957
;;
4058
*) echo "$0: Not sure how to handle node state \"$STATUS\" on $HOSTNAME" ;;
@@ -46,3 +64,4 @@ else
4664
exit -1
4765
fi
4866
exit 0
67+

0 commit comments

Comments
 (0)