-
Notifications
You must be signed in to change notification settings - Fork 87
Open
Description
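When NHC runs as an SGE load sensor with syslog logging enabled, it currently writes the same failure message to syslog on every run until the problem is resolved. This change suppresses the syslog message under SGE when the failure message is identical to the previous one, and logs a single "Health check recovered" message once the checks pass again.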
diff --git a/nhc b/nhc
index 1705e79..706c07d 100755
--- a/nhc
+++ b/nhc
@@ -40,6 +40,10 @@
### Library functions
+# Cache for the last message to avoid spamming syslog in the SGE loop
+# until the state changes.
+last_died_msg=
+
# Declare a print-error-and-exit function.
function die() {
IFS=$' \t\n'
@@ -48,8 +52,11 @@ function die() {
CHECK_DIED=1
log "ERROR: $NAME: Health check failed: $*"
- syslog "Health check failed: $*"
- syslog_flush
+ if [[ "$NHC_RM" != "sge" || "$*" != "$last_died_msg" ]]; then
+ last_died_msg="$*"
+ syslog "Health check failed: $*"
+ syslog_flush
+ fi
if [[ -n "$NHC_RM" && "$MARK_OFFLINE" -eq 1 && "$FAIL_CNT" -eq 0 ]]; then
eval $OFFLINE_NODE "'$HOSTNAME'" "'$*'"
fi
@@ -628,6 +635,10 @@ function nhcmain_mark_online() {
function nhcmain_finish() {
local ELAPSED
+ if [[ -n "$last_died_msg" ]]; then
+ syslog "Health check recovered"
+ last_died_msg=
+ fi
syslog_flush
ELAPSED=$((SECONDS-NHC_START_TS))
vlog "Node Health Check completed successfully (${ELAPSED}s)."