
Commit ece0a4e

Consolidate revival logic in case the pod is in an unknown, evicted, or unreachable state. Limit the number of revivals to 3.
Signed-off-by: Tim Ramlot <[email protected]>
1 parent 0663562 commit ece0a4e

File tree

6 files changed: +113 -52 lines changed

pkg/apis/prowjobs/v1/types.go
pkg/config/config.go
pkg/config/config_test.go
pkg/config/prow-config-documented.yaml
pkg/plank/controller_test.go
pkg/plank/reconciler.go
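Before the per-file diffs, it may help to see the shape of the consolidated logic. The following is a minimal, runnable sketch of the decision flow that syncPendingJob implements after this change; stopCause and decide are illustrative names, not code from this commit.

package main

import "fmt"

// stopCause loosely mirrors the PodUnexpectedStopCause values introduced
// in pkg/plank/reconciler.go; decide is a hypothetical helper that only
// models the switch in syncPendingJob.
type stopCause string

const (
	causeNone    stopCause = ""
	causeEvicted stopCause = "evicted"
)

func decide(cause stopCause, errorOnEviction bool, revivalCount, maxRevivals int) string {
	switch {
	case cause == causeNone:
		return "no unexpected stop, fall through to normal phase handling"
	case cause == causeEvicted && errorOnEviction:
		return "fail the job: eviction is configured to be fatal"
	case revivalCount >= maxRevivals:
		return "fail the job: revived too many times already"
	default:
		return "increment revivalCount, delete the pod, recreate on next sync"
	}
}

func main() {
	fmt.Println(decide(causeEvicted, false, 0, 3)) // revive
	fmt.Println(decide(causeEvicted, false, 3, 3)) // give up
	fmt.Println(decide(causeEvicted, true, 0, 3))  // fail immediately
}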

pkg/apis/prowjobs/v1/types.go

Lines changed: 2 additions & 0 deletions

@@ -1098,6 +1098,8 @@ type ProwJobStatus struct {
 	PendingTime *metav1.Time `json:"pendingTime,omitempty"`
 	// CompletionTime is the timestamp for when the job goes to a final state
 	CompletionTime *metav1.Time `json:"completionTime,omitempty"`
+	// Amount of times the Pod was revived from an unexpected stop.
+	RevivalCount int `json:"revivalCount,omitempty"`
 	// +kubebuilder:validation:Enum=scheduling;triggered;pending;success;failure;aborted;error
 	// +kubebuilder:validation:Required
 	State ProwJobState `json:"state,omitempty"`
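Because the new field carries omitempty, revivalCount only shows up in a serialized ProwJob once a revival has actually happened, so previously stored objects round-trip unchanged. A small sketch with a stand-in struct (not the real ProwJobStatus) demonstrates this:

package main

import (
	"encoding/json"
	"fmt"
)

// status stands in for the revival-related slice of ProwJobStatus.
type status struct {
	RevivalCount int `json:"revivalCount,omitempty"`
}

func main() {
	before, _ := json.Marshal(status{})
	after, _ := json.Marshal(status{RevivalCount: 2})
	fmt.Println(string(before)) // {} (a zero count is omitted entirely)
	fmt.Println(string(after))  // {"revivalCount":2}
}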

pkg/config/config.go

Lines changed: 11 additions & 0 deletions

@@ -655,6 +655,12 @@ type Plank struct {
 	// stuck in an unscheduled state. Defaults to 5 minutes.
 	PodUnscheduledTimeout *metav1.Duration `json:"pod_unscheduled_timeout,omitempty"`
 
+	// MaxRevivals is the maximum number of times a prowjob will be retried in case of an
+	// unexpected stop of the job before being marked as failed. Generally a job is stopped
+	// unexpectedly due to the underlying Node being terminated, evicted or becoming unreachable.
+	// Defaults to 3. A value of 0 means no retries.
+	MaxRevivals *int `json:"max_revivals,omitempty"`
+
 	// DefaultDecorationConfigs holds the default decoration config for specific values.
 	//
 	// Each entry in the slice specifies Repo and Cluster regexp filter fields to
@@ -2503,6 +2509,11 @@ func parseProwConfig(c *Config) error {
 		c.Plank.PodUnscheduledTimeout = &metav1.Duration{Duration: 5 * time.Minute}
 	}
 
+	if c.Plank.MaxRevivals == nil {
+		maxRetries := 3
+		c.Plank.MaxRevivals = &maxRetries
+	}
+
 	if err := c.Gerrit.DefaultAndValidate(); err != nil {
 		return fmt.Errorf("validating gerrit config: %w", err)
 	}
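MaxRevivals is a *int rather than a plain int so the defaulting above can tell "unset" (nil, which becomes 3) apart from an explicit 0 (which disables revivals). A standalone sketch of that pattern, using a hypothetical defaultMaxRevivals helper:

package main

import "fmt"

// defaultMaxRevivals reproduces the nil-check from parseProwConfig:
// only a nil pointer receives the default.
func defaultMaxRevivals(configured *int) int {
	if configured == nil {
		maxRetries := 3
		configured = &maxRetries
	}
	return *configured
}

func main() {
	zero := 0
	fmt.Println(defaultMaxRevivals(nil))   // 3 (unset falls back to the default)
	fmt.Println(defaultMaxRevivals(&zero)) // 0 (explicit zero disables revivals)
}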

pkg/config/config_test.go

Lines changed: 4 additions & 0 deletions

@@ -8426,6 +8426,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
@@ -8510,6 +8511,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
@@ -8587,6 +8589,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
@@ -8669,6 +8672,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

pkg/config/prow-config-documented.yaml

Lines changed: 5 additions & 0 deletions

@@ -1301,6 +1301,11 @@ plank:
   # JobURLPrefixDisableAppendStorageProvider disables that the storageProvider is
   # automatically appended to the JobURLPrefix.
   jobURLPrefixDisableAppendStorageProvider: true
+  # MaxRevivals is the maximum number of times a prowjob will be retried in case of an
+  # unexpected stop of the job before being marked as failed. Generally a job is stopped
+  # unexpectedly due to the underlying Node being terminated, evicted or becoming unreachable.
+  # Defaults to 3. A value of 0 means no retries.
+  max_revivals: 0
   # PodPendingTimeout defines how long the controller will wait to perform a garbage
   # collection on pending pods. Defaults to 10 minutes.
   pod_pending_timeout: 0s
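The same unset-versus-zero distinction applies when the config is read from YAML: omitting max_revivals leaves the pointer nil and the default of 3 kicks in later, while an explicit max_revivals: 0 survives parsing and turns revivals off. A sketch with a stand-in struct, assuming the sigs.k8s.io/yaml package commonly used in this codebase:

package main

import (
	"fmt"

	"sigs.k8s.io/yaml"
)

// plank stands in for the MaxRevivals slice of the real Plank config.
type plank struct {
	MaxRevivals *int `json:"max_revivals,omitempty"`
}

func main() {
	var unset, disabled plank
	_ = yaml.Unmarshal([]byte("{}"), &unset)
	_ = yaml.Unmarshal([]byte("max_revivals: 0"), &disabled)
	fmt.Println(unset.MaxRevivals == nil) // true (will be defaulted to 3 later)
	fmt.Println(*disabled.MaxRevivals)    // 0 (explicitly disabled)
}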

pkg/plank/controller_test.go

Lines changed: 38 additions & 0 deletions

@@ -65,6 +65,8 @@ const (
 	podDeletionPreventionFinalizer = "keep-from-vanishing"
 )
 
+var maxRevivals = 3
+
 func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[string]int) *fca {
 	presubmits := []config.Presubmit{
 		{
@@ -106,6 +108,7 @@ func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[st
 			PodPendingTimeout:     &metav1.Duration{Duration: podPendingTimeout},
 			PodRunningTimeout:     &metav1.Duration{Duration: podRunningTimeout},
 			PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
+			MaxRevivals:           &maxRevivals,
 		},
 	},
 	JobConfig: config.JobConfig{
@@ -1180,6 +1183,41 @@ func TestSyncPendingJob(t *testing.T) {
 			ExpectedNumPods:  1,
 			ExpectedURL:      "boop-42/error",
 		},
+		{
+			// TODO: this test case tests the current behavior, but the behavior
+			// is non-ideal: the pod execution did not fail, instead the node on which
+			// the pod was running terminated
+			Name: "a terminated pod is handled as-if it failed",
+			PJ: prowapi.ProwJob{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "boop-42",
+					Namespace: "prowjobs",
+				},
+				Spec: prowapi.ProwJobSpec{
+					PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
+				},
+				Status: prowapi.ProwJobStatus{
+					State:   prowapi.PendingState,
+					PodName: "boop-42",
+				},
+			},
+			Pods: []v1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "boop-42",
+						Namespace: "pods",
+					},
+					Status: v1.PodStatus{
+						Phase:  v1.PodFailed,
+						Reason: Terminated,
+					},
+				},
+			},
+			ExpectedComplete: true,
+			ExpectedState:    prowapi.FailureState,
+			ExpectedNumPods:  1,
+			ExpectedURL:      "boop-42/error",
+		},
 		{
 			Name: "running pod",
 			PJ: prowapi.ProwJob{

pkg/plank/reconciler.go

Lines changed: 53 additions & 52 deletions

@@ -62,7 +62,8 @@ const ControllerName = "plank"
 
 // PodStatus constants
 const (
-	Evicted = "Evicted"
+	Evicted    = "Evicted"
+	Terminated = "Terminated"
 )
 
 // NodeStatus constants
@@ -468,76 +469,51 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.Status.PodName = pn
 			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
 		}
-	} else if pod.Status.Reason == Evicted {
-		// Pod was evicted.
-		if pj.Spec.ErrorOnEviction {
-			// ErrorOnEviction is enabled, complete the PJ and mark it as
-			// errored.
+	} else if podUnexpectedStopCause := getPodUnexpectedStopCause(pod); podUnexpectedStopCause != PodUnexpectedStopCauseNone {
+		switch {
+		case podUnexpectedStopCause == PodUnexpectedStopCauseEvicted && pj.Spec.ErrorOnEviction:
+			// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
 			r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod was evicted by the cluster."
-		} else {
-			// ErrorOnEviction is disabled. Delete the pod now and recreate it in
-			// the next resync.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, deleting & next sync loop will restart pod")
+		case pj.Status.RevivalCount >= *r.config().Plank.MaxRevivals:
+			// MaxRevivals is reached, complete the PJ and mark it as errored.
+			r.log.WithField("unexpected-stop-cause", podUnexpectedStopCause).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
+			pj.SetComplete()
+			pj.Status.State = prowv1.ErrorState
+			pj.Status.Description = fmt.Sprintf("Job pod reached max revivals (%d) after being stopped unexpectedly (%s)", pj.Status.RevivalCount, podUnexpectedStopCause)
+		default:
+			// Update the revival count and delete the pod so it gets recreated in the next resync.
+			pj.Status.RevivalCount++
+			r.log.
+				WithField("unexpected-stop-cause", podUnexpectedStopCause).
+				WithFields(pjutil.ProwJobFields(pj)).
+				Info("Pod has stopped unexpectedly, deleting & next sync loop will restart pod")
+
 			client, ok := r.buildClients[pj.ClusterAlias()]
 			if !ok {
-				return nil, TerminalError(fmt.Errorf("evicted pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+				return nil, TerminalError(fmt.Errorf("pod %s which was stopped unexpectedly (%s): unknown cluster alias %q", pod.Name, podUnexpectedStopCause, pj.ClusterAlias()))
 			}
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
+			if finalizers := sets.New(pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
 				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
 				oldPod := pod.DeepCopy()
 				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
 				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
 				}
 			}
-			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
-			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-		}
-	} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
-		// This can happen in any phase and means the node got evicted after it became unresponsive. Delete the finalizer so the pod
-		// vanishes and we will silently re-create it in the next iteration.
-		r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got lost, deleting & next sync loop will restart pod")
-		client, ok := r.buildClients[pj.ClusterAlias()]
-		if !ok {
-			return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
-		}
 
-		if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-			// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-			oldPod := pod.DeepCopy()
-			pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-			if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-				return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-			}
-		}
-
-		return nil, nil
-	} else {
-		switch pod.Status.Phase {
-		case corev1.PodUnknown:
-			// Pod is in Unknown state. This can happen if there is a problem with
-			// the node. Delete the old pod, this will fire an event that triggers
-			// a new reconciliation in which we will re-create the pod.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is in unknown state, deleting & restarting pod")
-			client, ok := r.buildClients[pj.ClusterAlias()]
-			if !ok {
-				return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+			// Pod is already deleted, so we don't need to delete it again.
+			if pod.DeletionTimestamp != nil {
+				return nil, nil
 			}
 
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-				oldPod := pod.DeepCopy()
-				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-				}
-			}
 			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
 			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-
+		}
+	} else {
+		switch pod.Status.Phase {
 		case corev1.PodSucceeded:
 			pj.SetComplete()
 			// There were bugs around this in the past so be paranoid and verify each container
@@ -679,6 +655,31 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 		return nil, nil
 	}
 
+type PodUnexpectedStopCause string
+
+const (
+	PodUnexpectedStopCauseNone        PodUnexpectedStopCause = ""
+	PodUnexpectedStopCauseUnknown     PodUnexpectedStopCause = "unknown"
+	PodUnexpectedStopCauseEvicted     PodUnexpectedStopCause = "evicted"
+	PodUnexpectedStopCauseUnreachable PodUnexpectedStopCause = "unreachable"
+)
+
+func getPodUnexpectedStopCause(pod *corev1.Pod) PodUnexpectedStopCause {
+	if pod.Status.Reason == Evicted {
+		return PodUnexpectedStopCauseEvicted
+	}
+
+	if pod.Status.Reason == NodeUnreachablePodReason && pod.DeletionTimestamp != nil {
+		return PodUnexpectedStopCauseUnreachable
+	}
+
+	if pod.Status.Phase == corev1.PodUnknown {
+		return PodUnexpectedStopCauseUnknown
+	}
+
+	return PodUnexpectedStopCauseNone
+}
+
 // syncTriggeredJob syncs jobs that do not yet have an associated test workload running
 func (r *reconciler) syncTriggeredJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
 	prevPJ := pj.DeepCopy()
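Note that the order of checks in getPodUnexpectedStopCause is deliberate: eviction is classified first even though an evicted pod also reports a failed phase, and an unreachable node only counts once the pod carries a deletion timestamp. Below is a self-contained sketch of that precedence; the local constants stand in for the reconciler's, and the "NodeLost" value used for NodeUnreachablePodReason is an assumption, not taken from this diff.

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Stand-ins for the reconciler's constants.
const (
	evicted         = "Evicted"
	nodeUnreachable = "NodeLost" // assumed value of NodeUnreachablePodReason
)

// classify mirrors getPodUnexpectedStopCause from the diff above.
func classify(pod *corev1.Pod) string {
	if pod.Status.Reason == evicted {
		return "evicted"
	}
	if pod.Status.Reason == nodeUnreachable && pod.DeletionTimestamp != nil {
		return "unreachable"
	}
	if pod.Status.Phase == corev1.PodUnknown {
		return "unknown"
	}
	return "none"
}

func main() {
	now := metav1.NewTime(time.Now())
	fmt.Println(classify(&corev1.Pod{
		Status: corev1.PodStatus{Phase: corev1.PodFailed, Reason: evicted},
	})) // evicted (wins even though the phase is Failed)
	fmt.Println(classify(&corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &now},
		Status:     corev1.PodStatus{Reason: nodeUnreachable},
	})) // unreachable (requires the deletion timestamp)
	fmt.Println(classify(&corev1.Pod{
		Status: corev1.PodStatus{Reason: nodeUnreachable},
	})) // none (without a deletion timestamp the pod is not treated as stopped)
}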
