Skip to content

Commit 1d38bb9

Browse files
author
Michael Jennings
committed
At the request/suggestion of Matt McLean <[email protected]>, I added two new flags to check_ps_service() that let the user request that NHC verify the action being taken -- whether that is start/restart/cycle/-e or stop/kill/-E -- so that the check fails only if the action is unsuccessful.

The -v ("Verify Sync") flag causes NHC to wait for the requested action to complete, whatever that action may be, and to check the action's exit code. If the action returns success, the check passes; the check fails only if the action fails. For example, the following causes NHC to restart the named service if it is not running, and the check fails only if "/sbin/service named restart" returns non-zero:

    check_ps_service -v -r named

The -V ("Verify Check") flag performs the same steps outlined above for -v, but additionally checks that the expected result of the action actually occurred; i.e., that the service is subsequently running or not running, depending on the parameters of the check. For example, the following kills any non-root sshd found running on the system, *and* makes sure the kill command succeeded, *and* makes sure afterward that the process has actually gone away (failing the check if, and only if, the process still exists):

    check_ps_service -V -k -u !root sshd

These changes are currently on branch service-restart-sync but will be merged into master after additional testing. Those wishing to test in the interim can build from this branch.
1 parent f2c9631 commit 1d38bb9

File tree

2 files changed

+87
-37
lines changed

2 files changed

+87
-37
lines changed

scripts/lbnl_ps.nhc

Lines changed: 83 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ function nhc_ps_gather_data() {
2525
local IFS PS_DATA THIS_PID i
2626
local -a LINES LINE
2727

28+
PS_PROCS=( ) PS_USER=( ) PS_PPID=( ) PS_PCPU=( ) PS_PMEM=( ) PS_RSS=( ) PS_VSZ=( ) PS_TIME=( ) PS_ARGS=( )
29+
2830
# We need passwd data to resolve UIDs for users with lengthy userids
2931
if [[ ${#PWDATA_USERS[*]} -eq 0 ]]; then
3032
nhc_common_load_passwd
@@ -382,29 +384,29 @@ function check_ps_blacklist() {
382384
# check_ps_service [-0] [-f] [-v|-V] [-S|-r|-c|-s|-k] [-u <user>] [-d <daemon> | -m <match>] [ -e <action> | -E <action> ] <service>
383385
function check_ps_service() {
384386
local SERVICE OWNER MATCH DAEMON NONFATAL=0 FULLMATCH=0 RESTART=0 CYCLE=0 START=0 STOP=0 KILL=0 VERIFY_SYNC=0 VERIFY_CHECK=0
385-
local ACTION FOUND_ACTION THIS_PID THIS_SVC i MSG RET
387+
local ACTION FOUND_ACTION THIS_PID THIS_SVC i MSG RET CMD
386388
local -a ARGS
387389

388390
if [[ ${#PS_PROCS[*]} -eq 0 ]]; then
389391
nhc_ps_gather_data
390392
fi
391393

392394
OPTIND=1
393-
while getopts ":0VSfrcskvu:d:m:e:E:" OPTION ; do
395+
while getopts ":0E:SVcd:e:fkm:rsu:v" OPTION ; do
394396
case "$OPTION" in
395397
0) NONFATAL=1 ;;
398+
E) FOUND_ACTION="$OPTARG" ;;
396399
S) START=1 ;;
397400
V) VERIFY_CHECK=1 ;;
401+
c) CYCLE=1 ;;
402+
d) DAEMON="$OPTARG" ;;
403+
e) ACTION="$OPTARG" ;;
398404
f) FULLMATCH=1 ;;
405+
k) KILL=1 ;;
406+
m) MATCH="$OPTARG" ;;
399407
r) RESTART=1 ;;
400-
c) CYCLE=1 ;;
401408
s) STOP=1 ;;
402-
k) KILL=1 ;;
403409
u) OWNER="$OPTARG" ;;
404-
d) DAEMON="$OPTARG" ;;
405-
m) MATCH="$OPTARG" ;;
406-
e) ACTION="$OPTARG" ;;
407-
E) FOUND_ACTION="$OPTARG" ;;
408410
v) VERIFY_SYNC=1 ;;
409411
:) die 1 "$FUNCNAME: Option -$OPTARG requires an argument." ; return 1 ;;
410412
\?) die 1 "$FUNCNAME: Invalid option: -$OPTARG" ; return 1 ;;
@@ -446,14 +448,13 @@ function check_ps_service() {
446448
fi
447449
fi
448450
# We have a matching process with the correct owner.
449-
if [[ "$FOUND_ACTION" != "" ]]; then
450-
${SHELL:-/bin/bash} -c "$FOUND_ACTION" &
451-
fi
452-
if [[ "$STOP" == "1" || "$KILL" == "1" ]]; then
451+
if [[ $STOP -eq 1 || $KILL -eq 1 || -n "$FOUND_ACTION" ]]; then
453452
# Logic is inverted; we DON'T want this process running, so finding it is a failure.
454453
MSG="$FUNCNAME: Service $SERVICE (process $DAEMON) ${OWNER:+owned by $OWNER }running"
455-
if [[ "$KILL" == "1" ]]; then
456-
if [[ "$SHELL" != ":" ]]; then
454+
if [[ $KILL -eq 1 ]]; then
455+
if [[ "$SHELL" == ":" ]]; then
456+
MSG="$MSG; killed process ID $THIS_PID (test mode)"
457+
else
457458
kill -9 $THIS_PID
458459
RET=$?
459460
if [[ $VERIFY_SYNC -eq 1 ]]; then
@@ -479,11 +480,8 @@ function check_ps_service() {
479480
else
480481
MSG="$MSG; killed process ID $THIS_PID (SIGKILL)"
481482
fi
482-
else
483-
MSG="$MSG; killed process ID $THIS_PID (test mode)"
484483
fi
485-
else
486-
# $STOP must be 1
484+
elif [[ $STOP -eq 1 ]]; then
487485
${SHELL:-/bin/bash} -c "/sbin/service $SERVICE stop" &
488486
if [[ "$SHELL" == ":" ]]; then
489487
MSG="$MSG; termination in progress"
@@ -509,8 +507,35 @@ function check_ps_service() {
509507
else
510508
MSG="$MSG; service termination in progress"
511509
fi
510+
else
511+
# We must have a $FOUND_ACTION to run.
512+
${SHELL:-/bin/bash} -c "$FOUND_ACTION" &
513+
if [[ "$SHELL" == ":" ]]; then
514+
MSG="$MSG; \"$FOUND_ACTION\" in progress."
515+
elif [[ $VERIFY_SYNC -eq 1 || $VERIFY_CHECK -eq 1 ]]; then
516+
# In VERIFY mode, we must "foreground" the action to check its return value.
517+
wait $!
518+
RET=$?
519+
if [[ $RET -ne 0 ]]; then
520+
# If the action fails, both VERIFY modes do the same thing.
521+
MSG="$MSG failed (exit code $RET)."
522+
elif [[ $VERIFY_CHECK -eq 1 ]]; then
523+
# VERIFY_CHECK mode requires that we also make sure the PID is really gone now.
524+
if kill -0 $THIS_PID ; then
525+
MSG="$MSG succeeded but failed to terminate process $THIS_PID."
526+
else
527+
log "$MSG successfully terminated service $SERVICE (process $THIS_PID)."
528+
return 0
529+
fi
530+
else
531+
log "$MSG succeeded."
532+
return 0
533+
fi
534+
else
535+
MSG="$MSG; \"$FOUND_ACTION\" in progress."
536+
fi
512537
fi
513-
if [[ $NONFATAL == 1 ]]; then
538+
if [[ $NONFATAL -eq 1 ]]; then
514539
if [[ -n "$MSG" ]]; then
515540
log "$MSG (non-fatal)"
516541
fi
@@ -524,26 +549,51 @@ function check_ps_service() {
524549
done
525550

526551
# No matching process found.
527-
if [[ "$STOP" == "1" || "$KILL" == "1" ]]; then
552+
if [[ $STOP -eq 1 || $KILL -eq 1 || -n "$FOUND_ACTION" ]]; then
528553
# Logic is inverted; we DON'T want this process running, so not finding it is a success.
529554
return 0
530555
fi
531556

532557
MSG="$FUNCNAME: Service $SERVICE (process $DAEMON) ${OWNER:+owned by $OWNER }not running"
533-
if [[ $START == 1 ]]; then
534-
${SHELL:-/bin/bash} -c "/sbin/service $SERVICE start" &
535-
MSG="$MSG; start in progress"
536-
elif [[ $RESTART == 1 ]]; then
537-
${SHELL:-/bin/bash} -c "/sbin/service $SERVICE restart" &
538-
MSG="$MSG; restart in progress"
539-
elif [[ $CYCLE == 1 ]]; then
540-
${SHELL:-/bin/bash} -c "/sbin/service $SERVICE stop ; sleep 2 ; /sbin/service $SERVICE start" &
541-
MSG="$MSG; cycle in progress"
542-
elif [[ "$ACTION" != "" ]]; then
543-
${SHELL:-/bin/bash} -c "$ACTION" &
544-
MSG="$MSG; executed \"$ACTION\""
558+
if [[ $START -eq 1 || $RESTART -eq 1 || $CYCLE -eq 1 || "$ACTION" != "" ]]; then
559+
if [[ $START -eq 1 ]]; then
560+
CMD="/sbin/service $SERVICE start"
561+
MSG="$MSG; start"
562+
elif [[ $RESTART -eq 1 ]]; then
563+
CMD="/sbin/service $SERVICE restart"
564+
MSG="$MSG; restart"
565+
elif [[ $CYCLE -eq 1 ]]; then
566+
CMD="/sbin/service $SERVICE stop ; sleep 2 ; /sbin/service $SERVICE start"
567+
MSG="$MSG; cycle"
568+
elif [[ "$ACTION" != "" ]]; then
569+
CMD="$ACTION"
570+
MSG="$MSG; \"$ACTION\""
571+
fi
572+
${SHELL:-/bin/bash} -c "$CMD" &
573+
if [[ $VERIFY_SYNC -eq 1 || $VERIFY_CHECK -eq 1 ]]; then
574+
wait $!
575+
RET=$?
576+
if [[ $RET -ne 0 ]]; then
577+
# If the command fails, both VERIFY modes do the same thing.
578+
MSG="$MSG failed (exit code $RET)."
579+
elif [[ $VERIFY_CHECK -eq 1 ]]; then
580+
# VERIFY_CHECK mode requires that we also make sure the process/service is now running.
581+
${SHELL:-/bin/bash} -c "/sbin/service $SERVICE status" >&/dev/null
582+
if [[ $? -ne 0 ]]; then
583+
MSG="$MSG succeeded but failed to start service $SERVICE."
584+
else
585+
log "$MSG succeeded; service $SERVICE now running."
586+
return 0
587+
fi
588+
else
589+
log "$MSG; service $SERVICE stopped successfully."
590+
return 0
591+
fi
592+
else
593+
MSG="$MSG in progress"
594+
fi
545595
fi
546-
if [[ $NONFATAL == 1 ]]; then
596+
if [[ $NONFATAL -eq 1 ]]; then
547597
if [[ -n "$MSG" ]]; then
548598
log "$MSG (non-fatal)"
549599
fi

test/test_lbnl_ps.nhc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -480,13 +480,13 @@ plan $((14+10+6+29+18+6+5+7+6+6+9)) "lbnl_ps.nhc" && {
480480
check_ps_service -m 'sshd*' sshd
481481
is $? 1 "Service check with exact match glob (failure)"
482482
SHELL=: check_ps_service -e "/sbin/shutdown -r 1" trqauthd
483-
is $? 0 "Service check with missing action (success)"
483+
is $? 0 "Service check with missing action (daemon found -- success)"
484484
SHELL=: check_ps_service -e "/sbin/shutdown -r 1" httpd
485-
is $? 1 "Service check with missing action (failure)"
485+
is $? 1 "Service check with missing action (daemon not found -- failure)"
486486
SHELL=: check_ps_service -E "true" trqauthd
487-
is $? 0 "Service check with found action (success)"
487+
is $? 1 "Service check with found action (daemon found -- failure)"
488488
SHELL=: check_ps_service -E "true" httpd
489-
is $? 1 "Service check with found action (failure)"
489+
is $? 0 "Service check with found action (daemon not found -- success)"
490490

491491
# Checks for excessive CPU utilization
492492
check_ps_cpu 99

0 commit comments

Comments
 (0)