Skip to content

Commit fbfd224

Browse files
Enable kubectl execution in sidecar (#24155)
Enable kubectl execution in sidecar
1 parent f4f202d commit fbfd224

File tree

5 files changed

+185
-83
lines changed

5 files changed

+185
-83
lines changed
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[Unit]
2+
Description=Telemetry container
3+
Requires=database.service
4+
After=database.service swss.service syncd.service
5+
BindsTo=sonic.target
6+
After=sonic.target
7+
StartLimitIntervalSec=1200
8+
StartLimitBurst=3
9+
10+
[Service]
Type=simple
User=root
# systemd only recognises "#" as a comment at the start of a line; a trailing
# "# ..." on an Exec*= line is handed to the command as extra arguments, so
# every note below sits on its own line.
#
# start: returns immediately (fire-and-forget pod deletion).
# NOTE(review): systemd documents that processes forked off by ExecStartPre=
# are killed before ExecStart= runs -- confirm the backgrounded deletion
# survives long enough to finish.
ExecStartPre=/usr/local/bin/telemetry.sh start
# wait: long-lived loop that observes pod status.
ExecStart=/usr/local/bin/telemetry.sh wait
# stop: has no lasting effect with kubesonic, because Kubernetes redeploys the
# pod automatically.
ExecStop=/usr/local/bin/telemetry.sh stop
RestartSec=30
TimeoutStartSec=30s
TimeoutStopSec=30s
Restart=always
Lines changed: 109 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,85 +1,141 @@
11
#!/bin/bash
2+
# 1. Runs as root via systemd service, so direct access to kubelet.conf is available; sudo is not required
3+
# 2. Use kubectl to get pods and delete pods with retry
4+
# 3. start is NON-BLOCKING (pod deletion runs in the background); stop/restart call kill_pods synchronously
5+
# 4. Only target pods matching POD_SELECTOR (default: raw_container_name=telemetry)
6+
27
set -euo pipefail
38

4-
SERVICE="telemetry"
5-
NS="${NS:-sonic}" # k8s namespace
6-
LABEL="raw_container_name=${SERVICE}" # selector used by DaemonSet
7-
KUBECTL_BIN="${KUBECTL_BIN:-kubectl}"
8-
NODE_NAME="${NODE_NAME:-$(hostname)}"
9-
DEV="${2:-}" # accepted for compatibility; unused (single-ASIC)
9+
NS="sonic"
10+
KUBECTL_BIN="/usr/bin/kubectl"
11+
KCF=(--kubeconfig=/etc/kubernetes/kubelet.conf)
12+
REQ_TIMEOUT="5s"
13+
MAX_ATTEMPTS=10
14+
BACKOFF_START=1
15+
BACKOFF_MAX=8
1016

11-
log() { /usr/bin/logger -t "${SERVICE}#system" "$*"; }
17+
# Label selector for telemetry pods; can be overridden via env
18+
# Example override: POD_SELECTOR="app=telemetry" telemetry.sh start
19+
POD_SELECTOR="${POD_SELECTOR:-raw_container_name=telemetry}"
1220

13-
require_kubectl() {
14-
if ! command -v "${KUBECTL_BIN}" >/dev/null 2>&1; then
15-
echo "ERROR: kubectl not found (KUBECTL_BIN=${KUBECTL_BIN})." >&2
16-
exit 127
17-
fi
18-
# Try a sensible default if KUBECONFIG isn’t set
19-
if [[ -z "${KUBECONFIG:-}" && -r /etc/kubernetes/kubelet.conf ]]; then
20-
export KUBECONFIG=/etc/kubernetes/kubelet.conf
21-
fi
21+
NODE_NAME="$(hostname | tr '[:upper:]' '[:lower:]')"
22+
log() { /usr/bin/logger -t "k8s-podctl#system" "$*"; }
23+
24+
#######################################
# Run kubectl with the shared kubeconfig and request timeout, retrying with
# exponential backoff until it succeeds or MAX_ATTEMPTS is reached.
# Globals:   KUBECTL_BIN, KCF, REQ_TIMEOUT, MAX_ATTEMPTS, BACKOFF_START,
#            BACKOFF_MAX (all read)
# Arguments: "$@" - arguments forwarded to kubectl
# Outputs:   kubectl's stdout (trailing newlines stripped) on success.
#            kubectl's stderr is NOT captured: it passes straight through to
#            the caller's stderr so warnings never pollute the parsed output.
# Returns:   0 on success, otherwise the exit status of the last attempt
#######################################
kubectl_retry() {
  local attempt=1 backoff=${BACKOFF_START} out rc
  while true; do
    # '|| rc=$?' keeps 'set -e' from aborting the script on a failed attempt
    # before the retry logic gets a chance to run.
    rc=0
    out="$("${KUBECTL_BIN}" "${KCF[@]}" --request-timeout="${REQ_TIMEOUT}" "$@")" || rc=$?
    if (( rc == 0 )); then
      printf '%s' "$out"
      return 0
    fi
    if (( attempt >= MAX_ATTEMPTS )); then
      # Surface whatever kubectl wrote to stdout on the final failed attempt.
      if [[ -n "$out" ]]; then
        printf '%s\n' "$out" >&2
      fi
      return "$rc"
    fi
    log "kubectl retry ${attempt}/${MAX_ATTEMPTS} for: $*"
    sleep "${backoff}"
    # Assignment form: a bare (( expr )) returns status 1 when expr is 0,
    # which would trip 'set -e'.
    backoff=$(( backoff < BACKOFF_MAX ? backoff * 2 : BACKOFF_MAX ))
    attempt=$(( attempt + 1 ))
  done
}
2342

2443
pods_on_node() {
25-
# Prints: "<name> <phase>" per line for this node
26-
"${KUBECTL_BIN}" -n "${NS}" get pods \
27-
-l "${LABEL}" \
44+
kubectl_retry -n "${NS}" get pods \
2845
--field-selector "spec.nodeName=${NODE_NAME}" \
29-
-o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.phase}{"\n"}{end}' 2>/dev/null || true
46+
-l "${POD_SELECTOR}" \
47+
-o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.phase}{"\n"}{end}' || true
48+
}
49+
50+
# Print the name of every pod on this node that matches POD_SELECTOR, one per
# line. Best-effort: a kubectl failure yields whatever was printed (possibly
# nothing) and the function still returns 0.
pod_names_on_node() {
  local -a query=(
    -n "${NS}" get pods
    --field-selector "spec.nodeName=${NODE_NAME}"
    -l "${POD_SELECTOR}"
    -o 'jsonpath={range .items[*]}{.metadata.name}{"\n"}{end}'
  )
  kubectl_retry "${query[@]}" || return 0
}
56+
57+
#######################################
# Force-delete one pod (no grace period, no wait) via kubectl_retry and log
# the outcome.
# Globals:   NS (read)
# Arguments: $1 - pod name
# Outputs:   nothing on stdout/stderr; result is reported through log()
# Returns:   0 if the delete succeeded, otherwise kubectl_retry's exit status
#######################################
delete_pod_with_retry() {
  local name="$1"
  local out rc=0
  # Capture the status with '|| rc=$?': with a bare assignment followed by
  # 'rc=$?', 'set -e' would exit on failure before the error is logged.
  out="$(kubectl_retry -n "${NS}" delete pod "${name}" --force --grace-period=0 --wait=false 2>&1)" || rc=$?
  if (( rc != 0 )); then
    log "ERROR delete pod '${name}' failed rc=${rc}: ${out}"
  else
    log "Deleted pod '${name}'"
  fi
  return "$rc"
}
3169

3270
kill_pods() {
33-
require_kubectl
34-
local found=0
35-
while read -r name phase; do
36-
[[ -z "${name}" ]] && continue
37-
found=1
38-
log "Deleting ${SERVICE} pod ${name} (phase=${phase}) on node ${NODE_NAME}"
39-
# Force/instant delete to emulate “kill”; DaemonSet will recreate
40-
"${KUBECTL_BIN}" -n "${NS}" delete pod "${name}" --grace-period=0 --force >/dev/null 2>&1 || true
41-
done < <(pods_on_node)
42-
if [[ "${found}" -eq 0 ]]; then
43-
log "No ${SERVICE} pods found on node ${NODE_NAME} (namespace=${NS}, label=${LABEL})."
71+
mapfile -t names < <(pod_names_on_node)
72+
if (( ${#names[@]} == 0 )); then
73+
log "No pods found on ${NODE_NAME} (ns=${NS}, selector=${POD_SELECTOR})."
74+
return 0
75+
fi
76+
77+
log "Deleting pods on ${NODE_NAME} (ns=${NS}, selector=${POD_SELECTOR}): ${names[*]}"
78+
79+
local rc_any=0
80+
for p in "${names[@]}"; do
81+
[[ -z "$p" ]] && continue
82+
if ! delete_pod_with_retry "$p"; then
83+
rc_any=1
84+
fi
85+
done
86+
87+
if (( rc_any != 0 )); then
88+
log "ERROR one or more pod deletions failed on ${NODE_NAME} (selector=${POD_SELECTOR})"
89+
else
90+
log "All targeted pods deleted on ${NODE_NAME} (selector=${POD_SELECTOR})"
4491
fi
92+
return "$rc_any"
93+
}
94+
95+
# start: launch kill_pods in the background, route its stdout/stderr to the
# system log, and exit 0 immediately without waiting for the deletions.
cmd_start() {
  local -a info_sink err_sink
  if command -v systemd-cat >/dev/null 2>&1; then
    # pipe to journald with distinct priorities
    info_sink=(systemd-cat -t telemetry-start -p info)
    err_sink=(systemd-cat -t telemetry-start -p err)
  else
    # pipe to syslog via logger in case systemd-journald is masked/disabled
    info_sink=(logger -t "telemetry-start" -p user.info)
    err_sink=(logger -t "telemetry-start" -p user.err)
  fi

  # Fire-and-forget: background the deletion, then detach it from this shell.
  ( kill_pods ) \
    > >("${info_sink[@]}") \
    2> >("${err_sink[@]}") &
  disown
  exit 0
}
46110

47-
cmd_start() { kill_pods; } # start == kill (DS restarts)
48111
cmd_stop() { kill_pods; }
49112
cmd_restart() { kill_pods; }
50113

51114
cmd_status() {
52-
require_kubectl
53-
local out; out="$(pods_on_node)"
54-
if [[ -z "${out}" ]]; then
55-
echo "${SERVICE}: NOT RUNNING (no pod on node ${NODE_NAME})"
115+
local out=""; out="$(pods_on_node)"
116+
if [[ -z "$out" ]]; then
117+
echo "NOT RUNNING (no pod on node ${NODE_NAME} with selector '${POD_SELECTOR}')"
56118
exit 3
57119
fi
58-
echo "${out}" | while read -r name phase; do
59-
[[ -z "${name}" ]] && continue
60-
echo "${SERVICE} pod ${name}: ${phase}"
61-
done
62-
# Exit 0 if at least one Running, 1 otherwise
63-
if echo "${out}" | awk '$2=="Running"{found=1} END{exit found?0:1}'; then
120+
while read -r name phase; do
121+
[[ -z "$name" ]] && continue
122+
echo "pod ${name}: ${phase}"
123+
done <<<"$out"
124+
if awk '$2=="Running"{found=1} END{exit found?0:1}' <<<"$out"; then
64125
exit 0
65126
else
66127
exit 1
67128
fi
68129
}
69130

70131
cmd_wait() {
71-
require_kubectl
72-
log "Waiting on ${SERVICE} pods (ns=${NS}, label=${LABEL}) on node ${NODE_NAME}..."
73-
# Keep the systemd service 'active' as long as at least one pod exists for this node.
132+
log "Waiting on pods (ns=${NS}, selector=${POD_SELECTOR}) on node ${NODE_NAME}"
74133
while true; do
75-
local out; out="$(pods_on_node)"
76-
if [[ -z "${out}" ]]; then
77-
# no pod presently; keep waiting (DaemonSet may bring it up)
78-
sleep 5
79-
continue
134+
local out=""; out="$(pods_on_node)"
135+
if [[ -z "$out" ]]; then
136+
sleep 5; continue
80137
fi
81-
# If at least one is Running, sleep longer; otherwise poll faster
82-
if echo "${out}" | awk '$2=="Running"{found=1} END{exit found?0:1}'; then
138+
if awk '$2=="Running"{found=1} END{exit found?0:1}' <<<"$out"; then
83139
sleep 60
84140
else
85141
sleep 5
@@ -94,7 +150,7 @@ case "${1:-}" in
94150
wait) cmd_wait ;;
95151
status) cmd_status ;;
96152
*)
97-
echo "Usage: $0 {start|stop|restart|wait|status} [asic-id(optional, ignored)]" >&2
153+
echo "Usage: $0 {start|stop|restart|wait|status}" >&2
98154
exit 2
99155
;;
100156
esac

dockers/docker-telemetry-sidecar/systemd_scripts/tests/test_systemd_stub.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def fake_run_nsenter(args, *, text=True, input_bytes=None):
7878
target = args[-1]
7979
host_fs.pop(target, None)
8080
return 0, "" if text else b"", "" if text else b""
81-
# sudo …
81+
# sudo … (allow anything)
8282
if args[:1] == ["sudo"]:
8383
return 0, "" if text else b"", "" if text else b""
8484
return 1, "" if text else b"", "unsupported" if text else b"unsupported"
@@ -87,6 +87,7 @@ def fake_run_nsenter(args, *, text=True, input_bytes=None):
8787

8888
# Fake container FS
8989
container_fs = {}
90+
9091
def fake_read_file_bytes_local(path: str):
9192
return container_fs.get(path, None)
9293

@@ -213,3 +214,31 @@ def test_env_controls_telemetry_src_default(monkeypatch):
213214
ss = importlib.import_module("systemd_stub")
214215
assert ss.IS_V1_ENABLED is False
215216
assert ss._TELEMETRY_SRC.endswith("telemetry.sh")
217+
218+
219+
def test_telemetry_service_syncs_to_host_when_different(ss):
    """ensure_sync() copies the telemetry unit to the host when contents differ.

    Also checks that the post-copy systemctl actions registered for the host
    unit path show up among the recorded sudo commands.
    """
    stub, container_fs, host_fs, commands = ss
    src = stub.CONTAINER_TELEMETRY_SERVICE
    dst = stub.HOST_TELEMETRY_SERVICE

    # Container holds the new unit, host still has the old one.
    container_fs[src] = b"UNIT-NEW"
    host_fs[dst] = b"UNIT-OLD"

    # Restrict syncing to the telemetry unit so the assertions stay focused.
    stub.SYNC_ITEMS[:] = [stub.SyncItem(src, dst, 0o644)]

    # Post-copy actions expected to run once the unit has been replaced.
    stub.POST_COPY_ACTIONS[dst] = [
        ["sudo", "systemctl", "daemon-reload"],
        ["sudo", "systemctl", "restart", "telemetry"],
    ]

    assert stub.ensure_sync() is True
    assert host_fs[dst] == b"UNIT-NEW"

    # Verify systemctl actions were invoked.
    sudo_calls = [args for _, args in commands if args and args[0] == "sudo"]
    for expected in (
        ("sudo", "systemctl", "daemon-reload"),
        ("sudo", "systemctl", "restart", "telemetry"),
    ):
        assert expected in sudo_calls

0 commit comments

Comments
 (0)