
Commit a8266e7

Address review comments from Fabrizio (mhc target, mhc controller code)
Refactors `needsRemediation`; specifically, the following changes were made:

- Move machine condition evaluation to always execute first, regardless of node state
- Ensure machine conditions are checked in ALL scenarios:
  * When the node is missing (t.nodeMissing)
  * When the node hasn't appeared yet (t.Node == nil)
  * When the node exists (t.Node != nil)
- Consistently merge node and machine condition messages in all failure scenarios
- Maintain backward compatibility with existing condition message formats
- Use appropriate condition reasons based on which conditions are unhealthy

Signed-off-by: Furkat Gofurov <[email protected]>
1 parent d807911 commit a8266e7
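
The core pattern behind these changes can be shown as a minimal, self-contained Go sketch (not the controller code itself; the message and reason strings below are purely illustrative): machine-condition messages are collected first, then merged with any node-related message, and the condition reason is chosen based on which checks actually failed.

package main

import (
    "fmt"
    "strings"
)

func main() {
    // Illustrative stand-ins for the messages the health check collects.
    unhealthyMachineMessages := []string{"Condition EtcdPodHealthy on Machine is reporting status False for more than 5m0s"}
    nodeMissingMessage := "Node worker-0 has been deleted"

    // Merge the node message with the machine messages so nothing is lost.
    allMessages := append([]string{nodeMissingMessage}, unhealthyMachineMessages...)

    // Machine-condition failures take precedence when choosing the reason.
    reason := "NodeDeleted"
    if len(unhealthyMachineMessages) > 0 {
        reason = "UnhealthyMachine"
    }

    fmt.Println("Reason: ", reason)
    fmt.Println("Message:", "Health check failed: "+strings.Join(allMessages, "; "))
}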

File tree

2 files changed: 153 additions, 110 deletions


internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go

Lines changed: 9 additions & 7 deletions
@@ -1332,6 +1332,15 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) {
 
     mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
 
+    // Add UnhealthyMachineConditions to test machine condition evaluation
+    mhc.Spec.Checks.UnhealthyMachineConditions = []clusterv1.UnhealthyMachineCondition{
+        {
+            Type:           controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition,
+            Status:         metav1.ConditionFalse,
+            TimeoutSeconds: ptr.To(int32(5 * 60)),
+        },
+    }
+
     g.Expect(env.Create(ctx, mhc)).To(Succeed())
     defer func(do ...client.Object) {
         g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
@@ -2879,13 +2888,6 @@ func newMachineHealthCheck(namespace, clusterName string) *clusterv1.MachineHealthCheck {
                 TimeoutSeconds: ptr.To(int32(5 * 60)),
             },
         },
-        UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{
-            {
-                Type:           clusterv1.MachineReadyCondition,
-                Status:         metav1.ConditionUnknown,
-                TimeoutSeconds: ptr.To(int32(5 * 60)),
-            },
-        },
     },
     Remediation: clusterv1.MachineHealthCheckRemediation{
         TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{
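
For reference, further machine checks could be appended in the same test in the obvious way; the condition type and timeout in this sketch are illustrative and not part of this commit:

mhc.Spec.Checks.UnhealthyMachineConditions = append(mhc.Spec.Checks.UnhealthyMachineConditions,
    clusterv1.UnhealthyMachineCondition{
        Type:           clusterv1.MachineReadyCondition,
        Status:         metav1.ConditionUnknown,
        TimeoutSeconds: ptr.To(int32(10 * 60)), // 10 minutes, illustrative
    },
)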

internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

Lines changed: 144 additions & 103 deletions
@@ -74,16 +74,20 @@ func (t *healthCheckTarget) nodeName() string {
     return ""
 }
 
-// Determine whether or not a given target needs remediation.
-// The node will need remediation if any of the following are true:
+// needsRemediation determines whether a given target needs remediation.
+// The machine will need remediation if any of the following are true:
 // - The Machine has the remediate machine annotation
-// - The Machine has failed for some reason
+// - Any condition on the machine matches the configured checks and exceeds the timeout
 // - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses
-// - The Node has gone away
-// - Any condition on the node is matched for the given timeout
-// If the target doesn't currently need rememdiation, provide a duration after
-// which the target should next be checked.
-// The target should be requeued after this duration.
+// - The Node has been deleted but the Machine still references it
+// - Any condition on the node matches the configured checks and exceeds the timeout
+//
+// Machine conditions are always evaluated first and consistently across all scenarios
+// (node missing, node startup timeout, node exists) to ensure comprehensive health checking.
+// When multiple issues are detected, error messages are merged to provide complete visibility.
+//
+// Returns true if remediation is needed, and a duration indicating when to recheck if remediation
+// is not immediately required. The target should be requeued after this duration.
 func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
     var nextCheckTimes []time.Duration
     now := time.Now()
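
For context, a caller-side sketch (paraphrased, not quoted from the controller) of how the two return values are meant to be consumed: remediate immediately when the first value is true, otherwise record the returned duration as a requeue candidate when it is positive.

for _, t := range targets {
    needsRemediation, nextCheck := t.needsRemediation(logger, timeoutForMachineToHaveNode)
    if needsRemediation {
        unhealthy = append(unhealthy, t) // target will be remediated
        continue
    }
    if nextCheck > 0 {
        nextCheckTimes = append(nextCheckTimes, nextCheck) // requeue no later than this
    }
}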
@@ -101,20 +105,6 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
         return true, time.Duration(0)
     }
 
-    // Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster.
-    if t.nodeMissing {
-        logger.V(3).Info("Target is unhealthy: node is missing")
-        v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.NodeNotFoundV1Beta1Reason, clusterv1.ConditionSeverityWarning, "")
-
-        conditions.Set(t.Machine, metav1.Condition{
-            Type:    clusterv1.MachineHealthCheckSucceededCondition,
-            Status:  metav1.ConditionFalse,
-            Reason:  clusterv1.MachineHealthCheckNodeDeletedReason,
-            Message: fmt.Sprintf("Health check failed: Node %s has been deleted", t.Machine.Status.NodeRef.Name),
-        })
-        return true, time.Duration(0)
-    }
-
     // Don't penalize any Machine/Node if the control plane has not been initialized
     // Exception of this rule are control plane machine itself, so the first control plane machine can be remediated.
     if !conditions.IsTrue(t.Cluster, clusterv1.ClusterControlPlaneInitializedCondition) && !util.IsControlPlaneMachine(t.Machine) {
@@ -130,12 +120,99 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
         return false, 0
     }
 
+    // Collect all unhealthy conditions (both node and machine) to provide comprehensive status
+    // Always evaluate machine conditions regardless of node state for consistent behavior
+    var (
+        unhealthyNodeMessages    []string
+        unhealthyMachineMessages []string
+    )
+
+    // Always check machine conditions first, regardless of node state
+    for _, c := range t.MHC.Spec.Checks.UnhealthyMachineConditions {
+        machineCondition := getMachineCondition(t.Machine, c.Type)
+
+        // Skip when current machine condition is different from the one reported
+        // in the MachineHealthCheck.
+        if machineCondition == nil || machineCondition.Status != c.Status {
+            continue
+        }
+
+        // If the machine condition has been in the unhealthy state for longer than the
+        // timeout, mark as unhealthy and collect the message.
+        timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second
+
+        if machineCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) {
+            unhealthyMachineMessages = append(unhealthyMachineMessages, fmt.Sprintf("Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()))
+            logger.V(3).Info("Target is unhealthy: machine condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
+            continue
+        }
+
+        durationUnhealthy := now.Sub(machineCondition.LastTransitionTime.Time)
+        nextCheck := timeoutSecondsDuration - durationUnhealthy + time.Second
+        if nextCheck > 0 {
+            nextCheckTimes = append(nextCheckTimes, nextCheck)
+        }
+    }
+
+    // Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster.
+    if t.nodeMissing {
+        logger.V(3).Info("Target is unhealthy: node is missing")
+
+        // Always merge node missing message with any unhealthy machine conditions
+        nodeMissingMessage := fmt.Sprintf("Node %s has been deleted", t.Machine.Status.NodeRef.Name)
+        allMessages := append([]string{nodeMissingMessage}, unhealthyMachineMessages...)
+
+        reason := clusterv1.MachineHealthCheckNodeDeletedReason
+        v1beta1Reason := clusterv1.NodeNotFoundV1Beta1Reason
+        if len(unhealthyMachineMessages) > 0 {
+            reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
+            v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
+        }
+
+        // For v1beta2 we use a single-line message prefixed with "Health check failed: "
+        conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
+        conditions.Set(t.Machine, metav1.Condition{
+            Type:    clusterv1.MachineHealthCheckSucceededCondition,
+            Status:  metav1.ConditionFalse,
+            Reason:  reason,
+            Message: conditionMessage,
+        })
+
+        // For v1beta1 keep the existing format
+        if len(unhealthyMachineMessages) > 0 {
+            v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
+        } else {
+            v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "")
+        }
+        return true, time.Duration(0)
+    }
+
     // the node has not been set yet
     if t.Node == nil {
+        // Check if we already have unhealthy machine conditions that should trigger remediation
+        if len(unhealthyMachineMessages) > 0 {
+            reason := clusterv1.MachineHealthCheckUnhealthyMachineReason
+            v1beta1Reason := clusterv1.UnhealthyMachineConditionV1Beta1Reason
+
+            // For v1beta2 we use a single-line message prefixed with "Health check failed: "
+            conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(unhealthyMachineMessages, "; "))
+            conditions.Set(t.Machine, metav1.Condition{
+                Type:    clusterv1.MachineHealthCheckSucceededCondition,
+                Status:  metav1.ConditionFalse,
+                Reason:  reason,
+                Message: conditionMessage,
+            })
+
+            // For v1beta1 keep the existing semicolon-separated format (no prefix).
+            v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(unhealthyMachineMessages, "; "))
+
+            return true, time.Duration(0)
+        }
+
         if timeoutForMachineToHaveNode == disabledNodeStartupTimeout {
             // Startup timeout is disabled so no need to go any further.
             // No node yet to check conditions, can return early here.
-            return false, 0
+            return false, minDuration(nextCheckTimes)
         }
 
         controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ClusterControlPlaneInitializedCondition)
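
The per-condition timeout arithmetic used in the loop above can be illustrated with a standalone, runnable sketch (values are illustrative): a condition trips the check once it has been in the matching state for longer than TimeoutSeconds; otherwise the remaining time, plus a one-second buffer, becomes a candidate requeue interval.

package main

import (
    "fmt"
    "time"
)

func main() {
    timeout := 5 * time.Minute                         // e.g. TimeoutSeconds: 5 * 60
    lastTransition := time.Now().Add(-3 * time.Minute) // condition has matched for 3 minutes
    now := time.Now()

    if lastTransition.Add(timeout).Before(now) {
        fmt.Println("unhealthy: timeout exceeded, remediation needed")
        return
    }

    durationUnhealthy := now.Sub(lastTransition)
    nextCheck := timeout - durationUnhealthy + time.Second // recheck shortly after the deadline
    fmt.Printf("healthy for now, recheck in ~%s\n", nextCheck.Round(time.Second))
}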
@@ -164,32 +241,47 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
 
         timeoutDuration := timeoutForMachineToHaveNode.Duration
         if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
-            v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.NodeStartupTimeoutV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
+            // Node startup timeout - merge with any unhealthy machine conditions
+            nodeTimeoutMessage := fmt.Sprintf("Node failed to report startup in %s", timeoutDuration)
+            allMessages := append([]string{nodeTimeoutMessage}, unhealthyMachineMessages...)
+
+            reason := clusterv1.MachineHealthCheckNodeStartupTimeoutReason
+            v1beta1Reason := clusterv1.NodeStartupTimeoutV1Beta1Reason
+            if len(unhealthyMachineMessages) > 0 {
+                reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
+                v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
+            }
+
             logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration)
 
+            // For v1beta2 we use a single-line message prefixed with "Health check failed: "
+            conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
             conditions.Set(t.Machine, metav1.Condition{
                 Type:    clusterv1.MachineHealthCheckSucceededCondition,
                 Status:  metav1.ConditionFalse,
-                Reason:  clusterv1.MachineHealthCheckNodeStartupTimeoutReason,
-                Message: fmt.Sprintf("Health check failed: Node failed to report startup in %s", timeoutDuration),
+                Reason:  reason,
+                Message: conditionMessage,
             })
+
+            // For v1beta1 keep the existing format
+            if len(unhealthyMachineMessages) > 0 {
+                v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
+            } else {
+                v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
+            }
             return true, time.Duration(0)
         }
 
         durationUnhealthy := now.Sub(comparisonTime)
         nextCheck := timeoutDuration - durationUnhealthy + time.Second
+        if nextCheck > 0 {
+            nextCheckTimes = append(nextCheckTimes, nextCheck)
+        }
 
-        return false, nextCheck
+        return false, minDuration(nextCheckTimes)
     }
 
-    // Collect all unhealthy conditions (both node and machine) to provide comprehensive status
-    var (
-        unhealthyMessages       []string
-        unhealthyReasons        []string
-        foundUnhealthyCondition bool
-    )
-
-    // check node conditions
+    // check node conditions (only when node is available)
     for _, c := range t.MHC.Spec.Checks.UnhealthyNodeConditions {
         nodeCondition := getNodeCondition(t.Node, c.Type)
 
@@ -204,9 +296,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
         timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second
 
         if nodeCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) {
-            foundUnhealthyCondition = true
-            unhealthyMessages = append(unhealthyMessages, fmt.Sprintf("Node condition %s is %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()))
-            unhealthyReasons = append(unhealthyReasons, "UnhealthyNode")
+            unhealthyNodeMessages = append(unhealthyNodeMessages, fmt.Sprintf("Condition %s on Node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()))
             logger.V(3).Info("Target is unhealthy: node condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
             continue
         }
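
Several non-remediation returns in needsRemediation now go through minDuration(nextCheckTimes), a helper that already exists elsewhere in machinehealthcheck_targets.go and is not part of this diff. A plausible shape for such a helper, as a sketch: it returns the smallest collected duration, or zero when no recheck is pending.

func minDuration(durations []time.Duration) time.Duration {
    if len(durations) == 0 {
        return time.Duration(0)
    }
    minDur := durations[0]
    for _, d := range durations[1:] {
        if d < minDur {
            minDur = d
        }
    }
    return minDur
}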
@@ -218,78 +308,29 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
         }
     }
 
-    // check machine conditions
-    for _, c := range t.MHC.Spec.Checks.UnhealthyMachineConditions {
-        machineCondition := getMachineCondition(t.Machine, c.Type)
-
-        // Skip when current machine condition is different from the one reported
-        // in the MachineHealthCheck.
-        if machineCondition == nil || machineCondition.Status != c.Status {
-            continue
-        }
-
-        // If the machine condition has been in the unhealthy state for longer than the
-        // timeout, mark as unhealthy and collect the message.
-        timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second
-
-        if machineCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) {
-            foundUnhealthyCondition = true
-            unhealthyMessages = append(unhealthyMessages, fmt.Sprintf("Machine condition %s is %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()))
-            unhealthyReasons = append(unhealthyReasons, "UnhealthyMachine")
-            logger.V(3).Info("Target is unhealthy: machine condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
-            continue
-        }
-
-        durationUnhealthy := now.Sub(machineCondition.LastTransitionTime.Time)
-        nextCheck := timeoutSecondsDuration - durationUnhealthy + time.Second
-        if nextCheck > 0 {
-            nextCheckTimes = append(nextCheckTimes, nextCheck)
-        }
-    }
-
-    // If any unhealthy conditions were found, set the combined status
-    if foundUnhealthyCondition {
-        // Determine the primary reason based on a consistent priority order:
-        // 1. If both node and machine conditions are present, use a combined reason
-        // 2. Otherwise use the specific reason for the type that failed
-        var primaryReason, v1beta1Reason string
-        if len(unhealthyReasons) > 0 {
-            // Check if we have both node and machine reasons
-            hasNodeReason := false
-            hasMachineReason := false
-            for _, reason := range unhealthyReasons {
-                switch reason {
-                case "UnhealthyNode":
-                    hasNodeReason = true
-                case "UnhealthyMachine":
-                    hasMachineReason = true
-                }
-            }
-
-            if hasNodeReason && hasMachineReason {
-                // Both types of conditions are unhealthy - use machine reason but indicate it's combined
-                primaryReason = clusterv1.MachineHealthCheckUnhealthyMachineReason
-                v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
-            } else if hasMachineReason {
-                primaryReason = clusterv1.MachineHealthCheckUnhealthyMachineReason
-                v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
-            } else if hasNodeReason {
-                primaryReason = clusterv1.MachineHealthCheckUnhealthyNodeReason
-                v1beta1Reason = clusterv1.UnhealthyNodeConditionV1Beta1Reason
-            }
+    if len(unhealthyNodeMessages) > 0 || len(unhealthyMachineMessages) > 0 {
+        reason := clusterv1.MachineHealthCheckUnhealthyNodeReason
+        v1beta1Reason := clusterv1.UnhealthyNodeConditionV1Beta1Reason
+        if len(unhealthyMachineMessages) > 0 {
+            reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
+            v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
         }
 
         // Combine all messages into a single comprehensive message
-        combinedMessage := fmt.Sprintf("Health check failed: %s", strings.Join(unhealthyMessages, "; "))
-
-        v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", combinedMessage)
-
+        allMessages := append(unhealthyNodeMessages, unhealthyMachineMessages...)
+        // For v1beta2 we use a single-line message prefixed with "Health check failed: "
+        // for compatibility with existing tests and other condition messages.
+        conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
         conditions.Set(t.Machine, metav1.Condition{
             Type:    clusterv1.MachineHealthCheckSucceededCondition,
             Status:  metav1.ConditionFalse,
-            Reason:  primaryReason,
-            Message: combinedMessage,
+            Reason:  reason,
+            Message: conditionMessage,
        })
+
+        // For v1beta1 keep the existing semicolon-separated format (no prefix).
+        v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
+
         return true, time.Duration(0)
     }
 
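With this final branch, a target whose node and machine conditions are both unhealthy ends up with a single merged condition message, for example (illustrative values): "Health check failed: Condition Ready on Node is reporting status Unknown for more than 5m0s; Condition EtcdPodHealthy on Machine is reporting status False for more than 5m0s", with the reason set to the machine-condition variant because machine failures take precedence.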
