@@ -74,16 +74,20 @@ func (t *healthCheckTarget) nodeName() string {
7474 return ""
7575}
7676
77- // Determine whether or not a given target needs remediation.
78- // The node will need remediation if any of the following are true:
77+ // needsRemediation determines whether a given target needs remediation.
78+ // The machine will need remediation if any of the following are true:
7979// - The Machine has the remediate machine annotation
80- // - The Machine has failed for some reason
80+ // - Any condition on the machine matches the configured checks and exceeds the timeout
8181// - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses
82- // - The Node has gone away
83- // - Any condition on the node is matched for the given timeout
84- // If the target doesn't currently need rememdiation, provide a duration after
85- // which the target should next be checked.
86- // The target should be requeued after this duration.
82+ // - The Node has been deleted but the Machine still references it
83+ // - Any condition on the node matches the configured checks and exceeds the timeout
84+ //
85+ // Machine conditions are always evaluated first and consistently across all scenarios
86+ // (node missing, node startup timeout, node exists) to ensure comprehensive health checking.
87+ // When multiple issues are detected, error messages are merged to provide complete visibility.
88+ //
89+ // Returns true if remediation is needed, and a duration indicating when to recheck if remediation
90+ // is not immediately required. The target should be requeued after this duration.
8791func (t * healthCheckTarget ) needsRemediation (logger logr.Logger , timeoutForMachineToHaveNode metav1.Duration ) (bool , time.Duration ) {
8892 var nextCheckTimes []time.Duration
8993 now := time .Now ()
@@ -101,20 +105,6 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
101105 return true , time .Duration (0 )
102106 }
103107
104- // Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster.
105- if t .nodeMissing {
106- logger .V (3 ).Info ("Target is unhealthy: node is missing" )
107- v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , clusterv1 .NodeNotFoundV1Beta1Reason , clusterv1 .ConditionSeverityWarning , "" )
108-
109- conditions .Set (t .Machine , metav1.Condition {
110- Type : clusterv1 .MachineHealthCheckSucceededCondition ,
111- Status : metav1 .ConditionFalse ,
112- Reason : clusterv1 .MachineHealthCheckNodeDeletedReason ,
113- Message : fmt .Sprintf ("Health check failed: Node %s has been deleted" , t .Machine .Status .NodeRef .Name ),
114- })
115- return true , time .Duration (0 )
116- }
117-
118108 // Don't penalize any Machine/Node if the control plane has not been initialized
119109 // Exception of this rule are control plane machine itself, so the first control plane machine can be remediated.
120110 if ! conditions .IsTrue (t .Cluster , clusterv1 .ClusterControlPlaneInitializedCondition ) && ! util .IsControlPlaneMachine (t .Machine ) {
@@ -130,12 +120,99 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
130120 return false , 0
131121 }
132122
123+ // Collect all unhealthy conditions (both node and machine) to provide comprehensive status
124+ // Always evaluate machine conditions regardless of node state for consistent behavior
125+ var (
126+ unhealthyNodeMessages []string
127+ unhealthyMachineMessages []string
128+ )
129+
130+ // Always check machine conditions first, regardless of node state
131+ for _ , c := range t .MHC .Spec .Checks .UnhealthyMachineConditions {
132+ machineCondition := getMachineCondition (t .Machine , c .Type )
133+
134+ // Skip when current machine condition is different from the one reported
135+ // in the MachineHealthCheck.
136+ if machineCondition == nil || machineCondition .Status != c .Status {
137+ continue
138+ }
139+
140+ // If the machine condition has been in the unhealthy state for longer than the
141+ // timeout, mark as unhealthy and collect the message.
142+ timeoutSecondsDuration := time .Duration (ptr .Deref (c .TimeoutSeconds , 0 )) * time .Second
143+
144+ if machineCondition .LastTransitionTime .Add (timeoutSecondsDuration ).Before (now ) {
145+ unhealthyMachineMessages = append (unhealthyMachineMessages , fmt .Sprintf ("Condition %s on Machine is reporting status %s for more than %s" , c .Type , c .Status , timeoutSecondsDuration .String ()))
146+ logger .V (3 ).Info ("Target is unhealthy: machine condition is in state longer than allowed timeout" , "condition" , c .Type , "state" , c .Status , "timeout" , timeoutSecondsDuration .String ())
147+ continue
148+ }
149+
150+ durationUnhealthy := now .Sub (machineCondition .LastTransitionTime .Time )
151+ nextCheck := timeoutSecondsDuration - durationUnhealthy + time .Second
152+ if nextCheck > 0 {
153+ nextCheckTimes = append (nextCheckTimes , nextCheck )
154+ }
155+ }
156+
157+ // Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster.
158+ if t .nodeMissing {
159+ logger .V (3 ).Info ("Target is unhealthy: node is missing" )
160+
161+ // Always merge node missing message with any unhealthy machine conditions
162+ nodeMissingMessage := fmt .Sprintf ("Node %s has been deleted" , t .Machine .Status .NodeRef .Name )
163+ allMessages := append ([]string {nodeMissingMessage }, unhealthyMachineMessages ... )
164+
165+ reason := clusterv1 .MachineHealthCheckNodeDeletedReason
166+ v1beta1Reason := clusterv1 .NodeNotFoundV1Beta1Reason
167+ if len (unhealthyMachineMessages ) > 0 {
168+ reason = clusterv1 .MachineHealthCheckUnhealthyMachineReason
169+ v1beta1Reason = clusterv1 .UnhealthyMachineConditionV1Beta1Reason
170+ }
171+
172+ // For v1beta2 we use a single-line message prefixed with "Health check failed: "
173+ conditionMessage := fmt .Sprintf ("Health check failed: %s" , strings .Join (allMessages , "; " ))
174+ conditions .Set (t .Machine , metav1.Condition {
175+ Type : clusterv1 .MachineHealthCheckSucceededCondition ,
176+ Status : metav1 .ConditionFalse ,
177+ Reason : reason ,
178+ Message : conditionMessage ,
179+ })
180+
181+ // For v1beta1 keep the existing format
182+ if len (unhealthyMachineMessages ) > 0 {
183+ v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "%s" , strings .Join (allMessages , "; " ))
184+ } else {
185+ v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "" )
186+ }
187+ return true , time .Duration (0 )
188+ }
189+
133190 // the node has not been set yet
134191 if t .Node == nil {
192+ // Check if we already have unhealthy machine conditions that should trigger remediation
193+ if len (unhealthyMachineMessages ) > 0 {
194+ reason := clusterv1 .MachineHealthCheckUnhealthyMachineReason
195+ v1beta1Reason := clusterv1 .UnhealthyMachineConditionV1Beta1Reason
196+
197+ // For v1beta2 we use a single-line message prefixed with "Health check failed: "
198+ conditionMessage := fmt .Sprintf ("Health check failed: %s" , strings .Join (unhealthyMachineMessages , "; " ))
199+ conditions .Set (t .Machine , metav1.Condition {
200+ Type : clusterv1 .MachineHealthCheckSucceededCondition ,
201+ Status : metav1 .ConditionFalse ,
202+ Reason : reason ,
203+ Message : conditionMessage ,
204+ })
205+
206+ // For v1beta1 keep the existing semicolon-separated format (no prefix).
207+ v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "%s" , strings .Join (unhealthyMachineMessages , "; " ))
208+
209+ return true , time .Duration (0 )
210+ }
211+
135212 if timeoutForMachineToHaveNode == disabledNodeStartupTimeout {
136213 // Startup timeout is disabled so no need to go any further.
137214 // No node yet to check conditions, can return early here.
138- return false , 0
215+ return false , minDuration ( nextCheckTimes )
139216 }
140217
141218 controlPlaneInitialized := conditions .GetLastTransitionTime (t .Cluster , clusterv1 .ClusterControlPlaneInitializedCondition )
@@ -164,32 +241,47 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
164241
165242 timeoutDuration := timeoutForMachineToHaveNode .Duration
166243 if comparisonTime .Add (timeoutForMachineToHaveNode .Duration ).Before (now ) {
167- v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , clusterv1 .NodeStartupTimeoutV1Beta1Reason , clusterv1 .ConditionSeverityWarning , "Node failed to report startup in %s" , timeoutDuration )
244+ // Node startup timeout - merge with any unhealthy machine conditions
245+ nodeTimeoutMessage := fmt .Sprintf ("Node failed to report startup in %s" , timeoutDuration )
246+ allMessages := append ([]string {nodeTimeoutMessage }, unhealthyMachineMessages ... )
247+
248+ reason := clusterv1 .MachineHealthCheckNodeStartupTimeoutReason
249+ v1beta1Reason := clusterv1 .NodeStartupTimeoutV1Beta1Reason
250+ if len (unhealthyMachineMessages ) > 0 {
251+ reason = clusterv1 .MachineHealthCheckUnhealthyMachineReason
252+ v1beta1Reason = clusterv1 .UnhealthyMachineConditionV1Beta1Reason
253+ }
254+
168255 logger .V (3 ).Info ("Target is unhealthy: machine has no node" , "duration" , timeoutDuration )
169256
257+ // For v1beta2 we use a single-line message prefixed with "Health check failed: "
258+ conditionMessage := fmt .Sprintf ("Health check failed: %s" , strings .Join (allMessages , "; " ))
170259 conditions .Set (t .Machine , metav1.Condition {
171260 Type : clusterv1 .MachineHealthCheckSucceededCondition ,
172261 Status : metav1 .ConditionFalse ,
173- Reason : clusterv1 . MachineHealthCheckNodeStartupTimeoutReason ,
174- Message : fmt . Sprintf ( "Health check failed: Node failed to report startup in %s" , timeoutDuration ) ,
262+ Reason : reason ,
263+ Message : conditionMessage ,
175264 })
265+
266+ // For v1beta1 keep the existing format
267+ if len (unhealthyMachineMessages ) > 0 {
268+ v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "%s" , strings .Join (allMessages , "; " ))
269+ } else {
270+ v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "Node failed to report startup in %s" , timeoutDuration )
271+ }
176272 return true , time .Duration (0 )
177273 }
178274
179275 durationUnhealthy := now .Sub (comparisonTime )
180276 nextCheck := timeoutDuration - durationUnhealthy + time .Second
277+ if nextCheck > 0 {
278+ nextCheckTimes = append (nextCheckTimes , nextCheck )
279+ }
181280
182- return false , nextCheck
281+ return false , minDuration ( nextCheckTimes )
183282 }
184283
185- // Collect all unhealthy conditions (both node and machine) to provide comprehensive status
186- var (
187- unhealthyMessages []string
188- unhealthyReasons []string
189- foundUnhealthyCondition bool
190- )
191-
192- // check node conditions
284+ // check node conditions (only when node is available)
193285 for _ , c := range t .MHC .Spec .Checks .UnhealthyNodeConditions {
194286 nodeCondition := getNodeCondition (t .Node , c .Type )
195287
@@ -204,9 +296,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
204296 timeoutSecondsDuration := time .Duration (ptr .Deref (c .TimeoutSeconds , 0 )) * time .Second
205297
206298 if nodeCondition .LastTransitionTime .Add (timeoutSecondsDuration ).Before (now ) {
207- foundUnhealthyCondition = true
208- unhealthyMessages = append (unhealthyMessages , fmt .Sprintf ("Node condition %s is %s for more than %s" , c .Type , c .Status , timeoutSecondsDuration .String ()))
209- unhealthyReasons = append (unhealthyReasons , "UnhealthyNode" )
299+ unhealthyNodeMessages = append (unhealthyNodeMessages , fmt .Sprintf ("Condition %s on Node is reporting status %s for more than %s" , c .Type , c .Status , timeoutSecondsDuration .String ()))
210300 logger .V (3 ).Info ("Target is unhealthy: node condition is in state longer than allowed timeout" , "condition" , c .Type , "state" , c .Status , "timeout" , timeoutSecondsDuration .String ())
211301 continue
212302 }
@@ -218,78 +308,29 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
218308 }
219309 }
220310
221- // check machine conditions
222- for _ , c := range t .MHC .Spec .Checks .UnhealthyMachineConditions {
223- machineCondition := getMachineCondition (t .Machine , c .Type )
224-
225- // Skip when current machine condition is different from the one reported
226- // in the MachineHealthCheck.
227- if machineCondition == nil || machineCondition .Status != c .Status {
228- continue
229- }
230-
231- // If the machine condition has been in the unhealthy state for longer than the
232- // timeout, mark as unhealthy and collect the message.
233- timeoutSecondsDuration := time .Duration (ptr .Deref (c .TimeoutSeconds , 0 )) * time .Second
234-
235- if machineCondition .LastTransitionTime .Add (timeoutSecondsDuration ).Before (now ) {
236- foundUnhealthyCondition = true
237- unhealthyMessages = append (unhealthyMessages , fmt .Sprintf ("Machine condition %s is %s for more than %s" , c .Type , c .Status , timeoutSecondsDuration .String ()))
238- unhealthyReasons = append (unhealthyReasons , "UnhealthyMachine" )
239- logger .V (3 ).Info ("Target is unhealthy: machine condition is in state longer than allowed timeout" , "condition" , c .Type , "state" , c .Status , "timeout" , timeoutSecondsDuration .String ())
240- continue
241- }
242-
243- durationUnhealthy := now .Sub (machineCondition .LastTransitionTime .Time )
244- nextCheck := timeoutSecondsDuration - durationUnhealthy + time .Second
245- if nextCheck > 0 {
246- nextCheckTimes = append (nextCheckTimes , nextCheck )
247- }
248- }
249-
250- // If any unhealthy conditions were found, set the combined status
251- if foundUnhealthyCondition {
252- // Determine the primary reason based on a consistent priority order:
253- // 1. If both node and machine conditions are present, use a combined reason
254- // 2. Otherwise use the specific reason for the type that failed
255- var primaryReason , v1beta1Reason string
256- if len (unhealthyReasons ) > 0 {
257- // Check if we have both node and machine reasons
258- hasNodeReason := false
259- hasMachineReason := false
260- for _ , reason := range unhealthyReasons {
261- switch reason {
262- case "UnhealthyNode" :
263- hasNodeReason = true
264- case "UnhealthyMachine" :
265- hasMachineReason = true
266- }
267- }
268-
269- if hasNodeReason && hasMachineReason {
270- // Both types of conditions are unhealthy - use machine reason but indicate it's combined
271- primaryReason = clusterv1 .MachineHealthCheckUnhealthyMachineReason
272- v1beta1Reason = clusterv1 .UnhealthyMachineConditionV1Beta1Reason
273- } else if hasMachineReason {
274- primaryReason = clusterv1 .MachineHealthCheckUnhealthyMachineReason
275- v1beta1Reason = clusterv1 .UnhealthyMachineConditionV1Beta1Reason
276- } else if hasNodeReason {
277- primaryReason = clusterv1 .MachineHealthCheckUnhealthyNodeReason
278- v1beta1Reason = clusterv1 .UnhealthyNodeConditionV1Beta1Reason
279- }
311+ if len (unhealthyNodeMessages ) > 0 || len (unhealthyMachineMessages ) > 0 {
312+ reason := clusterv1 .MachineHealthCheckUnhealthyNodeReason
313+ v1beta1Reason := clusterv1 .UnhealthyNodeConditionV1Beta1Reason
314+ if len (unhealthyMachineMessages ) > 0 {
315+ reason = clusterv1 .MachineHealthCheckUnhealthyMachineReason
316+ v1beta1Reason = clusterv1 .UnhealthyMachineConditionV1Beta1Reason
280317 }
281318
282319 // Combine all messages into a single comprehensive message
283- combinedMessage := fmt . Sprintf ( "Health check failed: %s" , strings . Join ( unhealthyMessages , "; " ) )
284-
285- v1beta1conditions . MarkFalse ( t . Machine , clusterv1 . MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 . ConditionSeverityWarning , "%s" , combinedMessage )
286-
320+ allMessages := append ( unhealthyNodeMessages , unhealthyMachineMessages ... )
321+ // For v1beta2 we use a single-line message prefixed with "Health check failed: "
322+ // for compatibility with existing tests and other condition messages.
323+ conditionMessage := fmt . Sprintf ( "Health check failed: %s" , strings . Join ( allMessages , "; " ))
287324 conditions .Set (t .Machine , metav1.Condition {
288325 Type : clusterv1 .MachineHealthCheckSucceededCondition ,
289326 Status : metav1 .ConditionFalse ,
290- Reason : primaryReason ,
291- Message : combinedMessage ,
327+ Reason : reason ,
328+ Message : conditionMessage ,
292329 })
330+
331+ // For v1beta1 keep the existing semicolon-separated format (no prefix).
332+ v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "%s" , strings .Join (allMessages , "; " ))
333+
293334 return true , time .Duration (0 )
294335 }
295336
0 commit comments