Commit 5bde5e3
feat: warn on probe failing
We have a fairly common occurrence of KSM getting restarted because its liveness probe fails. This is seemingly caused by rolling deploys of the backing API server, but we don't know exactly why. This change makes it more visible what kind of response the API server returns when a probe fails. That, in turn, could help us either tweak the probes to weather situations where the API server is unavailable or overwhelmed (it's entirely possible it's returning KSM a bunch of 429s), or tell us whether we have a larger problem at hand.


pkg/app/server.go (3 additions, 1 deletion)
@@ -513,8 +513,10 @@ func buildTelemetryServer(registry prometheus.Gatherer, authFilter bool, kubeCon
 
 func handleClusterDelegationForProber(client kubernetes.Interface, probeType string) http.HandlerFunc {
 	return func(w http.ResponseWriter, _ *http.Request) {
-		got := client.CoreV1().RESTClient().Get().AbsPath(probeType).Do(context.Background())
+		var statusCode int
+		got := client.CoreV1().RESTClient().Get().AbsPath(probeType).Do(context.Background()).StatusCode(&statusCode)
 		if got.Error() != nil {
+			klog.Warningf("Failed to contact API server for %s: got %d", probeType, statusCode)
 			w.WriteHeader(http.StatusServiceUnavailable)
 			w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
 			return
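
For context, here is a minimal, self-contained sketch of the delegated-probe pattern the handler above implements. The wiring in main (in-cluster config, the /livez and /readyz routes, the :8080 listen address) and the plain 200 OK success path are assumptions for illustration, since the diff cuts off after the error branch; the client-go and klog calls are the same ones the diff uses.

package main

import (
	"context"
	"net/http"

	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/klog/v2"
)

// probeHandler delegates a liveness/readiness check to the API server by
// GETting the given path (e.g. /livez) and mirroring success or failure.
func probeHandler(client kubernetes.Interface, probeType string) http.HandlerFunc {
	return func(w http.ResponseWriter, _ *http.Request) {
		// StatusCode records the HTTP status code even when the request
		// errors, so an overloaded API server answering 429 shows up in
		// the warning below instead of being swallowed.
		var statusCode int
		got := client.CoreV1().RESTClient().Get().AbsPath(probeType).Do(context.Background()).StatusCode(&statusCode)
		if got.Error() != nil {
			klog.Warningf("Failed to contact API server for %s: got %d", probeType, statusCode)
			w.WriteHeader(http.StatusServiceUnavailable)
			w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
			return
		}
		// Assumed success path: report healthy (the diff is truncated
		// before KSM's actual success handling).
		w.WriteHeader(http.StatusOK)
		w.Write([]byte(http.StatusText(http.StatusOK)))
	}
}

func main() {
	cfg, err := rest.InClusterConfig() // hypothetical wiring; KSM builds its config elsewhere
	if err != nil {
		klog.Fatal(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)
	http.Handle("/livez", probeHandler(client, "/livez"))
	http.Handle("/readyz", probeHandler(client, "/readyz"))
	klog.Fatal(http.ListenAndServe(":8080", nil))
}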
