File tree Expand file tree Collapse file tree 1 file changed +48
-0
lines changed Expand file tree Collapse file tree 1 file changed +48
-0
#!/bin/bash
#
# SURFsara Node Health Check -- Node Rebooting Helper
#
# Martijn Kruiten <[email protected]>
# 26 apr 2018
#

# This script is a simple wrapper that the node health check can run
# in the background to mark nodes for reboot.  It will first obtain
# the current node state information to avoid rebooting a node that
# is already offline or in maintenance.  If these checks pass, the
# node is marked for reboot.
#
# Usage:  <this script> <hostname> [note ...]
#
# Environment:
#   NHC_RM                 Resource manager in use; only "slurm" is handled.
#   SLURM_SINFO            Command used to query node state (default: sinfo).
#   SLURM_SCONTROL         Command used to request the reboot
#                          (default: scontrol).
#   SLURM_SC_OFFLINE_ARGS  Arguments placed between the scontrol command and
#                          the hostname (default: "reboot ASAP").
#   The three SLURM_* values are split on whitespace, so each may carry
#   extra options.
#
# Exit status:
#   0  no reboot was requested (node already down/draining, or its state
#      was not recognised);
#   1  no hostname was given, or NHC_RM is not a supported resource manager;
#   otherwise the exit status of the scontrol command that replaced this
#   script.

LEADER="NHC:"

# Log the invocation, prefixed with a timestamp.
printf '%s %s %s\n' "$(date '+%Y%m%d %H:%M:%S')" "$0" "$*"

# Without a hostname there is nothing to query or reboot.  The message goes
# to stdout like every other message this script prints.
if [[ -z "${1:-}" ]]; then
    echo "$0: No hostname given; usage: $0 <hostname> [note ...]"
    exit 1
fi

# NODE rather than HOSTNAME: the latter is maintained by bash itself and
# should not be clobbered.
NODE="$1"
shift
NOTE="$*"

### SLURM
if [[ "${NHC_RM:-}" == "slurm" ]]; then
    SLURM_SINFO="${SLURM_SINFO:-sinfo}"
    SLURM_SCONTROL="${SLURM_SCONTROL:-scontrol}"
    SLURM_SC_OFFLINE_ARGS="${SLURM_SC_OFFLINE_ARGS:-reboot ASAP}"

    # Split the configurable command strings into words explicitly.  Unlike
    # an unquoted expansion, this never performs pathname (glob) expansion.
    read -r -a SINFO_CMD <<< "$SLURM_SINFO"
    read -r -a SCONTROL_CMD <<< "$SLURM_SCONTROL"
    read -r -a SC_ARGS <<< "$SLURM_SC_OFFLINE_ARGS"

    # Only the state column is needed.  Take the first word of the first
    # output line; STATUS stays empty if the query prints nothing or fails.
    STATUS=""
    read -r STATUS _ < <("${SINFO_CMD[@]}" -o '%t' -hn "$NODE")

    case "$STATUS" in
        alloc*|comp*|idle*|mix*|resume*|resv*|undrain*)
            echo "$0: Marking $STATUS $NODE for reboot: $LEADER $NOTE"
            # exec replaces this shell, so the scontrol exit status becomes
            # the exit status of the helper.
            exec "${SCONTROL_CMD[@]}" "${SC_ARGS[@]}" "$NODE"
            ;;
        down*|drain*|drng*|fail*|maint*)
            echo "$0: Not changing state of down node $NODE."
            ;;
        *)
            echo "$0: Not sure how to handle node state \"$STATUS\" on $NODE"
            ;;
    esac

### Everything else is unsupported.
else
    echo "$0: Unsupported RM detected in $0: \"${NHC_RM:-}\""
    # Exit statuses must lie in 0-255; "exit -1" is out of range.
    exit 1
fi
exit 0
You can’t perform that action at this time.
0 commit comments