Improve health check logic in getClusterHealthStatus function to handle shutdown delay or missing readyz endpoint

yanfeng1992 · yanfeng1992 · commit be5ff73c7da7 · 2025-04-11T14:46:51.000+08:00
Signed-off-by: huangyanfeng <huangyanfeng1992@gmail.com>
diff --git a/pkg/controllers/status/cluster_status_controller.go b/pkg/controllers/status/cluster_status_controller.go
@@ -435,9 +435,11 @@ func (c *ClusterStatusController) initLeaseController(cluster *clusterv1alpha1.C
 
 func getClusterHealthStatus(clusterClient *util.ClusterClient) (online, healthy bool) {
 	healthStatus, err := healthEndpointCheck(clusterClient.KubeClient, "/readyz")
-	if err != nil && healthStatus == http.StatusNotFound {
-		// do health check with healthz endpoint if the readyz endpoint is not installed in member cluster
-		healthStatus, err = healthEndpointCheck(clusterClient.KubeClient, "/healthz")
+	if err != nil && (healthStatus == http.StatusInternalServerError || healthStatus == http.StatusNotFound) {
+		// Fall back to the healthz endpoint in two cases:
+		// 1. StatusInternalServerError(500): when the server is configured with --shutdown-delay-duration, /readyz returns failure during the shutdown delay while /healthz still serves success.
+		// 2. StatusNotFound(404): when the readyz endpoint is not installed in the member cluster.
+		healthStatus, err = healthEndpointCheck(clusterClient.KubeClient, "/healthz") 
 	}
 
 	if err != nil {