#!/bin/bash
# 1. Runs as root via systemd service, so direct access to kubelet.conf is available; sudo is not required
# 2. Uses kubectl to get pods and delete pods, with retry/backoff
# 3. start/stop/restart are NON-BLOCKING
# 4. Only targets pods matching POD_SELECTOR (default: raw_container_name=telemetry)
set -euo pipefail

# --- Configuration (overridable via environment) ----------------------------
NS="${NS:-sonic}"                                # Kubernetes namespace
KUBECTL_BIN="${KUBECTL_BIN:-/usr/bin/kubectl}"   # kubectl binary
KCF=(--kubeconfig=/etc/kubernetes/kubelet.conf)  # runs as root; kubelet.conf is readable
REQ_TIMEOUT="5s"      # per-request API timeout passed to kubectl
MAX_ATTEMPTS=10       # max kubectl attempts before giving up
BACKOFF_START=1       # initial retry backoff (seconds)
BACKOFF_MAX=8         # backoff cap (seconds)

# Label selector for telemetry pods; can be overridden via env
# Example override: POD_SELECTOR="app=telemetry" telemetry.sh start
POD_SELECTOR="${POD_SELECTOR:-raw_container_name=telemetry}"
# Node name as registered in the cluster (kubelet lowercases hostnames).
NODE_NAME="$(hostname | tr '[:upper:]' '[:lower:]')"

# Log to syslog with a stable tag so messages are easy to grep.
log() { /usr/bin/logger -t "k8s-podctl#system" "$*"; }
23+
#######################################
# Run kubectl with kubeconfig + request timeout, retrying failures with
# capped exponential backoff.
# Globals:   KUBECTL_BIN, KCF, REQ_TIMEOUT, MAX_ATTEMPTS, BACKOFF_START,
#            BACKOFF_MAX (read)
# Arguments: kubectl arguments
# Outputs:   combined stdout+stderr of the successful attempt on stdout;
#            last attempt's output on stderr after final failure
# Returns:   0 on success, kubectl's exit code after MAX_ATTEMPTS failures
#######################################
kubectl_retry() {
  local attempt=1 backoff=${BACKOFF_START} out rc
  while true; do
    rc=0
    # '|| rc=$?' is required: a plain 'out=$(cmd); rc=$?' would abort the
    # whole script under 'set -e' before rc could ever be examined.
    out="$("${KUBECTL_BIN}" "${KCF[@]}" --request-timeout="${REQ_TIMEOUT}" "$@" 2>&1)" || rc=$?
    if (( rc == 0 )); then
      printf '%s' "$out"
      return 0
    fi
    if (( attempt >= MAX_ATTEMPTS )); then
      printf '%s\n' "$out" >&2
      return "$rc"
    fi
    log "kubectl retry ${attempt}/${MAX_ATTEMPTS} for: $*"
    sleep "${backoff}"
    (( backoff = backoff < BACKOFF_MAX ? backoff * 2 : BACKOFF_MAX ))
    (( attempt++ ))
  done
}
2342
# Print "<name> <phase>" per line for pods on this node matching the
# selector.  '|| true' makes lookup failures non-fatal so status/wait
# callers can keep polling (intentional best-effort).
pods_on_node() {
  kubectl_retry -n "${NS}" get pods \
    --field-selector "spec.nodeName=${NODE_NAME}" \
    -l "${POD_SELECTOR}" \
    -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.phase}{"\n"}{end}' || true
}
49+
# Print one pod name per line for this node's matching pods (used by
# kill_pods, which needs names only).  Lookup failures are non-fatal.
pod_names_on_node() {
  kubectl_retry -n "${NS}" get pods \
    --field-selector "spec.nodeName=${NODE_NAME}" \
    -l "${POD_SELECTOR}" \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true
}
56+
#######################################
# Force-delete a single pod without waiting; the DaemonSet recreates it.
# Arguments: $1 - pod name
# Returns:   0 on success, kubectl's exit code on failure (also logged)
#######################################
delete_pod_with_retry() {
  local name="$1"
  local out rc=0
  # '|| rc=$?' keeps a failed delete from killing the script under 'set -e'
  # so we can log the error and report it to the caller.
  out=$(kubectl_retry -n "${NS}" delete pod "${name}" --force --grace-period=0 --wait=false 2>&1) || rc=$?
  if (( rc != 0 )); then
    log "ERROR delete pod '${name}' failed rc=${rc}: ${out}"
  else
    log "Deleted pod '${name}'"
  fi
  return "$rc"
}
3169
#######################################
# Delete every matching pod on this node; the DaemonSet controller
# respawns them, so "kill" doubles as start/stop/restart.
# Returns: 0 if nothing to delete or all deletions succeeded, 1 otherwise
#######################################
kill_pods() {
  local names=()
  mapfile -t names < <(pod_names_on_node)
  if (( ${#names[@]} == 0 )); then
    log "No pods found on ${NODE_NAME} (ns=${NS}, selector=${POD_SELECTOR})."
    return 0
  fi

  log "Deleting pods on ${NODE_NAME} (ns=${NS}, selector=${POD_SELECTOR}): ${names[*]}"

  local rc_any=0 p
  for p in "${names[@]}"; do
    [[ -z "$p" ]] && continue
    if ! delete_pod_with_retry "$p"; then
      rc_any=1
    fi
  done

  if (( rc_any != 0 )); then
    log "ERROR one or more pod deletions failed on ${NODE_NAME} (selector=${POD_SELECTOR})"
  else
    log "All targeted pods deleted on ${NODE_NAME} (selector=${POD_SELECTOR})"
  fi
  return "$rc_any"
}
94+
#######################################
# NON-BLOCKING start: run kill_pods in a background subshell so systemd's
# ExecStart returns immediately; stdout/stderr go to journald via
# systemd-cat (distinct priorities), or to syslog via logger when
# journald is masked/disabled.
#######################################
cmd_start() {
  if command -v systemd-cat >/dev/null 2>&1; then
    ( kill_pods ) \
      > >(systemd-cat -t telemetry-start -p info) \
      2> >(systemd-cat -t telemetry-start -p err)
  else
    ( kill_pods ) \
      > >(logger -t "telemetry-start" -p user.info) \
      2> >(logger -t "telemetry-start" -p user.err)
  fi &
  disown  # detach so the service manager does not wait on the job
  exit 0
}
46110
# stop/restart both reduce to deleting the pods; the DaemonSet recreates them.
cmd_stop() { kill_pods; }
cmd_restart() { kill_pods; }
50113
#######################################
# Report pod status for this node.
# Exits: 0 if at least one matching pod is Running,
#        1 if pods exist but none is Running,
#        3 if no matching pod at all (LSB "not running")
#######################################
cmd_status() {
  local out
  out="$(pods_on_node)"
  if [[ -z "$out" ]]; then
    echo "NOT RUNNING (no pod on node ${NODE_NAME} with selector '${POD_SELECTOR}')"
    exit 3
  fi
  local name phase
  while read -r name phase; do
    [[ -z "$name" ]] && continue
    echo "pod ${name}: ${phase}"
  done <<< "$out"
  if awk '$2=="Running"{found=1} END{exit found?0:1}' <<< "$out"; then
    exit 0
  else
    exit 1
  fi
}
69130
70131cmd_wait () {
71- require_kubectl
72- log " Waiting on ${SERVICE} pods (ns=${NS} , label=${LABEL} ) on node ${NODE_NAME} ..."
73- # Keep the systemd service 'active' as long as at least one pod exists for this node.
132+ log " Waiting on pods (ns=${NS} , selector=${POD_SELECTOR} ) on node ${NODE_NAME} …"
74133 while true ; do
75- local out; out=" $( pods_on_node) "
76- if [[ -z " ${out} " ]]; then
77- # no pod presently; keep waiting (DaemonSet may bring it up)
78- sleep 5
79- continue
134+ local out=" " ; out=" $( pods_on_node) "
135+ if [[ -z " $out " ]]; then
136+ sleep 5; continue
80137 fi
81- # If at least one is Running, sleep longer; otherwise poll faster
82- if echo " ${out} " | awk ' $2=="Running"{found=1} END{exit found?0:1}' ; then
138+ if awk ' $2=="Running"{found=1} END{exit found?0:1}' <<< " $out" ; then
83139 sleep 60
84140 else
85141 sleep 5
@@ -94,7 +150,7 @@ case "${1:-}" in
94150 wait) cmd_wait ;;
95151 status) cmd_status ;;
96152 * )
97- echo " Usage: $0 {start|stop|restart|wait|status} [asic-id(optional, ignored)] " >&2
153+ echo " Usage: $0 {start|stop|restart|wait|status}" >&2
98154 exit 2
99155 ;;
100156esac