Skip to content

Commit d06279c

Browse files
author
Michael Jennings
committed
Merge branch 'service-restart-sync' into dev
* service-restart-sync: At the request/suggestion of Matt McLean <[email protected]>, I added two new flags (-v and -V) to check_ps_service(). They let the user ask NHC to verify the action being taken -- whether that's start/restart/cycle/-e or stop/kill/-E -- so that the check fails only if the action is unsuccessful.
2 parents edc201d + 1d38bb9 commit d06279c

File tree

2 files changed

+141
-39
lines changed

2 files changed

+141
-39
lines changed

scripts/lbnl_ps.nhc

Lines changed: 137 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ function nhc_ps_gather_data() {
2525
local IFS PS_DATA THIS_PID i
2626
local -a LINES LINE
2727

28+
PS_PROCS=( ) PS_USER=( ) PS_PPID=( ) PS_PCPU=( ) PS_PMEM=( ) PS_RSS=( ) PS_VSZ=( ) PS_TIME=( ) PS_ARGS=( )
29+
2830
# We need passwd data to resolve UIDs for users with lengthy userids
2931
if [[ ${#PWDATA_USERS[*]} -eq 0 ]]; then
3032
nhc_common_load_passwd
@@ -379,31 +381,33 @@ function check_ps_blacklist() {
379381
}
380382

381383
# Check to make sure a service is (or isn't) running. Syntax:
382-
# check_ps_service [-0] [-f] [-S|-r|-c|-s|-k] [-u <user>] [-d <daemon> | -m <match>] [ -e <action> | -E <action> ] <service>
384+
# check_ps_service [-0] [-f] [-v|-V] [-S|-r|-c|-s|-k] [-u <user>] [-d <daemon> | -m <match>] [ -e <action> | -E <action> ] <service>
383385
function check_ps_service() {
384-
local SERVICE OWNER MATCH DAEMON NONFATAL=0 FULLMATCH=0 RESTART=0 CYCLE=0 START=0 STOP=0 KILL=0 ACTION FOUND_ACTION
385-
local THIS_PID THIS_SVC i MSG
386+
local SERVICE OWNER MATCH DAEMON NONFATAL=0 FULLMATCH=0 RESTART=0 CYCLE=0 START=0 STOP=0 KILL=0 VERIFY_SYNC=0 VERIFY_CHECK=0
387+
local ACTION FOUND_ACTION THIS_PID THIS_SVC i MSG RET CMD
386388
local -a ARGS
387389

388390
if [[ ${#PS_PROCS[*]} -eq 0 ]]; then
389391
nhc_ps_gather_data
390392
fi
391393

392394
OPTIND=1
393-
while getopts ":0Sfrcsku:d:m:e:E:" OPTION ; do
395+
while getopts ":0E:SVcd:e:fkm:rsu:v" OPTION ; do
394396
case "$OPTION" in
395397
0) NONFATAL=1 ;;
398+
E) FOUND_ACTION="$OPTARG" ;;
396399
S) START=1 ;;
400+
V) VERIFY_CHECK=1 ;;
401+
c) CYCLE=1 ;;
402+
d) DAEMON="$OPTARG" ;;
403+
e) ACTION="$OPTARG" ;;
397404
f) FULLMATCH=1 ;;
405+
k) KILL=1 ;;
406+
m) MATCH="$OPTARG" ;;
398407
r) RESTART=1 ;;
399-
c) CYCLE=1 ;;
400408
s) STOP=1 ;;
401-
k) KILL=1 ;;
402409
u) OWNER="$OPTARG" ;;
403-
d) DAEMON="$OPTARG" ;;
404-
m) MATCH="$OPTARG" ;;
405-
e) ACTION="$OPTARG" ;;
406-
E) FOUND_ACTION="$OPTARG" ;;
410+
v) VERIFY_SYNC=1 ;;
407411
:) die 1 "$FUNCNAME: Option -$OPTARG requires an argument." ; return 1 ;;
408412
\?) die 1 "$FUNCNAME: Invalid option: -$OPTARG" ; return 1 ;;
409413
esac
@@ -444,21 +448,94 @@ function check_ps_service() {
444448
fi
445449
fi
446450
# We have a matching process with the correct owner.
447-
if [[ "$FOUND_ACTION" != "" ]]; then
448-
${SHELL:-/bin/bash} -c "$FOUND_ACTION" &
449-
fi
450-
if [[ "$STOP" == "1" || "$KILL" == "1" ]]; then
451+
if [[ $STOP -eq 1 || $KILL -eq 1 || -n "$FOUND_ACTION" ]]; then
451452
# Logic is inverted; we DON'T want this process running, so finding it is a failure.
452453
MSG="$FUNCNAME: Service $SERVICE (process $DAEMON) ${OWNER:+owned by $OWNER }running"
453-
if [[ "$KILL" == "1" ]]; then
454-
[[ "$SHELL" != ":" ]] && kill -9 $THIS_PID
455-
MSG="$MSG; killed process ID $THIS_PID"
456-
else
457-
# $STOP must be 1
454+
if [[ $KILL -eq 1 ]]; then
455+
if [[ "$SHELL" == ":" ]]; then
456+
MSG="$MSG; killed process ID $THIS_PID (test mode)"
457+
else
458+
kill -9 $THIS_PID
459+
RET=$?
460+
if [[ $VERIFY_SYNC -eq 1 ]]; then
461+
# VERIFY_SYNC here only means we check the return value of the kill built-in.
462+
if [[ $RET -eq 0 ]]; then
463+
log "$MSG; process ID $THIS_PID killed successfully."
464+
continue
465+
else
466+
MSG="$MSG; \"kill -9 $THIS_PID\" failed (exit code $RET)."
467+
fi
468+
elif [[ $VERIFY_CHECK -eq 1 ]]; then
469+
# VERIFY_CHECK here means we probe the PID with signal 0 (kill -0) after the kill and make sure it's gone.
470+
# Sleep very briefly to yield CPU, hopefully ensuring signal delivery.
471+
sleep 0.01
472+
if [[ $RET -ne 0 ]]; then
473+
MSG="$MSG; \"kill -9 $THIS_PID\" failed (exit code $RET)."
474+
elif kill -0 $THIS_PID ; then
475+
MSG="$MSG; \"kill -9 $THIS_PID\" succeeded but failed to terminate process."
476+
else
477+
log "$MSG; process ID $THIS_PID terminated successfully."
478+
return 0
479+
fi
480+
else
481+
MSG="$MSG; killed process ID $THIS_PID (SIGKILL)"
482+
fi
483+
fi
484+
elif [[ $STOP -eq 1 ]]; then
458485
${SHELL:-/bin/bash} -c "/sbin/service $SERVICE stop" &
459-
MSG="$MSG; termination in progress"
486+
if [[ "$SHELL" == ":" ]]; then
487+
MSG="$MSG; termination in progress"
488+
elif [[ $VERIFY_SYNC -eq 1 || $VERIFY_CHECK -eq 1 ]]; then
489+
# In VERIFY mode, we must "foreground" the service action to check its return value.
490+
wait $!
491+
RET=$?
492+
if [[ $RET -ne 0 ]]; then
493+
# If the "stop" fails, both VERIFY modes do the same thing.
494+
MSG="$MSG; \"/sbin/service $SERVICE stop\" failed (exit code $RET)."
495+
elif [[ $VERIFY_CHECK -eq 1 ]]; then
496+
# VERIFY_CHECK mode requires that we also make sure the PID is really gone now.
497+
if kill -0 $THIS_PID ; then
498+
MSG="$MSG; \"/sbin/service $SERVICE stop\" succeeded but failed to stop process $THIS_PID."
499+
else
500+
log "$MSG; service $SERVICE stopped and process $THIS_PID terminated successfully."
501+
return 0
502+
fi
503+
else
504+
log "$MSG; service $SERVICE stopped successfully."
505+
return 0
506+
fi
507+
else
508+
MSG="$MSG; service termination in progress"
509+
fi
510+
else
511+
# We must have a $FOUND_ACTION to run.
512+
${SHELL:-/bin/bash} -c "$FOUND_ACTION" &
513+
if [[ "$SHELL" == ":" ]]; then
514+
MSG="$MSG; \"$FOUND_ACTION\" in progress."
515+
elif [[ $VERIFY_SYNC -eq 1 || $VERIFY_CHECK -eq 1 ]]; then
516+
# In VERIFY mode, we must "foreground" the action to check its return value.
517+
wait $!
518+
RET=$?
519+
if [[ $RET -ne 0 ]]; then
520+
# If the action fails, both VERIFY modes do the same thing.
521+
MSG="$MSG failed (exit code $RET)."
522+
elif [[ $VERIFY_CHECK -eq 1 ]]; then
523+
# VERIFY_CHECK mode requires that we also make sure the PID is really gone now.
524+
if kill -0 $THIS_PID ; then
525+
MSG="$MSG succeeded but failed to terminate process $THIS_PID."
526+
else
527+
log "$MSG successfully terminated service $SERVICE (process $THIS_PID)."
528+
return 0
529+
fi
530+
else
531+
log "$MSG succeeded."
532+
return 0
533+
fi
534+
else
535+
MSG="$MSG; \"$FOUND_ACTION\" in progress."
536+
fi
460537
fi
461-
if [[ $NONFATAL == 1 ]]; then
538+
if [[ $NONFATAL -eq 1 ]]; then
462539
if [[ -n "$MSG" ]]; then
463540
log "$MSG (non-fatal)"
464541
fi
@@ -472,26 +549,51 @@ function check_ps_service() {
472549
done
473550

474551
# No matching process found.
475-
if [[ "$STOP" == "1" || "$KILL" == "1" ]]; then
552+
if [[ $STOP -eq 1 || $KILL -eq 1 || -n "$FOUND_ACTION" ]]; then
476553
# Logic is inverted; we DON'T want this process running, so not finding it is a success.
477554
return 0
478555
fi
479556

480557
MSG="$FUNCNAME: Service $SERVICE (process $DAEMON) ${OWNER:+owned by $OWNER }not running"
481-
if [[ $START == 1 ]]; then
482-
${SHELL:-/bin/bash} -c "/sbin/service $SERVICE start" &
483-
MSG="$MSG; start in progress"
484-
elif [[ $RESTART == 1 ]]; then
485-
${SHELL:-/bin/bash} -c "/sbin/service $SERVICE restart" &
486-
MSG="$MSG; restart in progress"
487-
elif [[ $CYCLE == 1 ]]; then
488-
${SHELL:-/bin/bash} -c "/sbin/service $SERVICE stop ; sleep 2 ; /sbin/service $SERVICE start" &
489-
MSG="$MSG; cycle in progress"
490-
elif [[ "$ACTION" != "" ]]; then
491-
${SHELL:-/bin/bash} -c "$ACTION" &
492-
MSG="$MSG; executed \"$ACTION\""
558+
if [[ $START -eq 1 || $RESTART -eq 1 || $CYCLE -eq 1 || "$ACTION" != "" ]]; then
559+
if [[ $START -eq 1 ]]; then
560+
CMD="/sbin/service $SERVICE start"
561+
MSG="$MSG; start"
562+
elif [[ $RESTART -eq 1 ]]; then
563+
CMD="/sbin/service $SERVICE restart"
564+
MSG="$MSG; restart"
565+
elif [[ $CYCLE -eq 1 ]]; then
566+
CMD="/sbin/service $SERVICE stop ; sleep 2 ; /sbin/service $SERVICE start"
567+
MSG="$MSG; cycle"
568+
elif [[ "$ACTION" != "" ]]; then
569+
CMD="$ACTION"
570+
MSG="$MSG; \"$ACTION\""
571+
fi
572+
${SHELL:-/bin/bash} -c "$CMD" &
573+
if [[ $VERIFY_SYNC -eq 1 || $VERIFY_CHECK -eq 1 ]]; then
574+
wait $!
575+
RET=$?
576+
if [[ $RET -ne 0 ]]; then
577+
# If the command fails, both VERIFY modes do the same thing.
578+
MSG="$MSG failed (exit code $RET)."
579+
elif [[ $VERIFY_CHECK -eq 1 ]]; then
580+
# VERIFY_CHECK mode requires that we also make sure the process/service is now running.
581+
${SHELL:-/bin/bash} -c "/sbin/service $SERVICE status" >&/dev/null
582+
if [[ $? -ne 0 ]]; then
583+
MSG="$MSG succeeded but failed to start service $SERVICE."
584+
else
585+
log "$MSG succeeded; service $SERVICE now running."
586+
return 0
587+
fi
588+
else
589+
log "$MSG; service $SERVICE stopped successfully."
590+
return 0
591+
fi
592+
else
593+
MSG="$MSG in progress"
594+
fi
493595
fi
494-
if [[ $NONFATAL == 1 ]]; then
596+
if [[ $NONFATAL -eq 1 ]]; then
495597
if [[ -n "$MSG" ]]; then
496598
log "$MSG (non-fatal)"
497599
fi

test/test_lbnl_ps.nhc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -480,13 +480,13 @@ plan $((14+10+6+29+18+6+5+7+6+6+9)) "lbnl_ps.nhc" && {
480480
check_ps_service -m 'sshd*' sshd
481481
is $? 1 "Service check with exact match glob (failure)"
482482
SHELL=: check_ps_service -e "/sbin/shutdown -r 1" trqauthd
483-
is $? 0 "Service check with missing action (success)"
483+
is $? 0 "Service check with missing action (daemon found -- success)"
484484
SHELL=: check_ps_service -e "/sbin/shutdown -r 1" httpd
485-
is $? 1 "Service check with missing action (failure)"
485+
is $? 1 "Service check with missing action (daemon not found -- failure)"
486486
SHELL=: check_ps_service -E "true" trqauthd
487-
is $? 0 "Service check with found action (success)"
487+
is $? 1 "Service check with found action (daemon found -- failure)"
488488
SHELL=: check_ps_service -E "true" httpd
489-
is $? 1 "Service check with found action (failure)"
489+
is $? 0 "Service check with found action (daemon not found -- success)"
490490

491491
# Checks for excessive CPU utilization
492492
check_ps_cpu 99

0 commit comments

Comments
 (0)