66# 26 apr 2018
77#
88
9- # This script is a simple wrapper that the node health check can run
10- # in the background to mark nodes for reboot. It will first obtain
11- # the current node state information to avoid rebooting a node that
12- # is already offline or in maintenance. If these checks pass, the
13- # node is marked for reboot.
9+ # This script is a simple pbsnodes wrapper that the node health check
10+ # can run in the background to mark nodes for reboot. It will first
11+ # obtain the current node state information to avoid rebooting a node
12+ # that is already offline or in maintenance. If these checks pass,
13+ # the node is marked for reboot.
1414
15- IGNORE_EMPTY_NOTE =" ${IGNORE_EMPTY_NOTE :- 0} "
15+ FORCE_REBOOT =" ${FORCE_REBOOT :- 0} "
1616LEADER=" NHC:"
1717
1818echo " ` date ' +%Y%m%d %H:%M:%S' ` $0 $* "
@@ -25,16 +25,34 @@ NOTE="$*"
2525if [[ " $NHC_RM " == " slurm" ]]; then
2626 SLURM_SINFO=" ${SLURM_SINFO:- sinfo} "
2727 SLURM_SCONTROL=" ${SLURM_SCONTROL:- scontrol} "
28- SLURM_SC_OFFLINE_ARGS =" ${SLURM_SC_OFFLINE_ARGS :- reboot ASAP} "
28+ SLURM_SC_REBOOT_ARGS =" ${SLURM_SC_REBOOT_ARGS :- reboot ASAP NextState=RESUME } "
2929
3030 LINE=( $( $SLURM_SINFO -o ' %t %E' -hn $HOSTNAME ) )
3131 STATUS=" ${LINE[0]} "
32+ OLD_NOTE_LEADER=" ${LINE[1]} "
33+ OLD_NOTE=" ${LINE[*]: 2} "
3234 case " $STATUS " in
33- alloc* |comp* |idle* |mix* |resume* |resv* |undrain* )
35+ alloc* |comp* |drain* |drng* |fail* |idle* |maint* |mix* |resume* |resv* |undrain* )
36+ case " $STATUS " in
37+ drain* |drng* |fail* |maint* )
38+ # If the node is already offline with a note that NHC did not set, and
39+ # FORCE_REBOOT is not 1, do not touch the node.
40+ if [[ " $OLD_NOTE_LEADER " != " $LEADER " && " $FORCE_REBOOT " != " 1" ]]; then
41+ echo " $0 : Not rebooting $HOSTNAME : Already offline."
42+ exit 0
43+ fi
44+ ;;
45+ esac
46+ # If there's an old note that wasn't set by NHC, preserve it and reboot with NextState=DOWN.
47+ if [[ " $OLD_NOTE_LEADER " != " none" && " $OLD_NOTE_LEADER " != " $LEADER " ]]; then
48+ LEADER=" $OLD_NOTE_LEADER "
49+ NOTE=" $OLD_NOTE "
50+ SLURM_SC_REBOOT_ARGS=" reboot ASAP NextState=DOWN"
51+ fi
3452 echo " $0 : Marking $STATUS $HOSTNAME for reboot: $LEADER $NOTE "
35- exec $SLURM_SCONTROL $SLURM_SC_OFFLINE_ARGS $HOSTNAME
53+ exec $SLURM_SCONTROL $SLURM_SC_REBOOT_ARGS Reason= " $LEADER $NOTE " $HOSTNAME
3654 ;;
37- down* |drain * |drng * |fail * |maint * )
55+ down* )
3856 echo " $0 : Not changing state of down node $HOSTNAME ."
3957 ;;
4058 * ) echo " $0 : Not sure how to handle node state \" $STATUS \" on $HOSTNAME " ;;
4664 exit -1
4765fi
4866exit 0
67+
0 commit comments