
Commit ece0a4e

Consolidate revival logic in case the pod is in an unknown, evicted, or unreachable state. Limit the number of revivals to 3.
Signed-off-by: Tim Ramlot <[email protected]>
1 parent 0663562 commit ece0a4e

File tree

6 files changed: +113 -52 lines changed

pkg/apis/prowjobs/v1/types.go
pkg/config/config.go
pkg/config/config_test.go
pkg/config/prow-config-documented.yaml
pkg/plank/controller_test.go
pkg/plank/reconciler.go
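Before the per-file diffs, it may help to see the shape of the consolidated logic. The following is a minimal, runnable sketch of the decision flow that syncPendingJob implements after this change; stopCause and decide are illustrative names, not code from this commit.

package main

import "fmt"

// stopCause loosely mirrors the PodUnexpectedStopCause values introduced
// in pkg/plank/reconciler.go; decide is a hypothetical helper that only
// models the switch in syncPendingJob.
type stopCause string

const (
	causeNone    stopCause = ""
	causeEvicted stopCause = "evicted"
)

func decide(cause stopCause, errorOnEviction bool, revivalCount, maxRevivals int) string {
	switch {
	case cause == causeNone:
		return "no unexpected stop, fall through to normal phase handling"
	case cause == causeEvicted && errorOnEviction:
		return "fail the job: eviction is configured to be fatal"
	case revivalCount >= maxRevivals:
		return "fail the job: revived too many times already"
	default:
		return "increment revivalCount, delete the pod, recreate on next sync"
	}
}

func main() {
	fmt.Println(decide(causeEvicted, false, 0, 3)) // revive
	fmt.Println(decide(causeEvicted, false, 3, 3)) // give up
	fmt.Println(decide(causeEvicted, true, 0, 3))  // fail immediately
}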

pkg/apis/prowjobs/v1/types.go

Lines changed: 2 additions & 0 deletions

@@ -1098,6 +1098,8 @@ type ProwJobStatus struct {
 	PendingTime *metav1.Time `json:"pendingTime,omitempty"`
 	// CompletionTime is the timestamp for when the job goes to a final state
 	CompletionTime *metav1.Time `json:"completionTime,omitempty"`
+	// Amount of times the Pod was revived from an unexpected stop.
+	RevivalCount int `json:"revivalCount,omitempty"`
 	// +kubebuilder:validation:Enum=scheduling;triggered;pending;success;failure;aborted;error
 	// +kubebuilder:validation:Required
 	State ProwJobState `json:"state,omitempty"`
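Because the new field carries omitempty, revivalCount only shows up in a serialized ProwJob once a revival has actually happened, so previously stored objects round-trip unchanged. A small sketch with a stand-in struct (not the real ProwJobStatus) demonstrates this:

package main

import (
	"encoding/json"
	"fmt"
)

// status stands in for the revival-related slice of ProwJobStatus.
type status struct {
	RevivalCount int `json:"revivalCount,omitempty"`
}

func main() {
	before, _ := json.Marshal(status{})
	after, _ := json.Marshal(status{RevivalCount: 2})
	fmt.Println(string(before)) // {} (a zero count is omitted entirely)
	fmt.Println(string(after))  // {"revivalCount":2}
}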

pkg/config/config.go

Lines changed: 11 additions & 0 deletions

@@ -655,6 +655,12 @@ type Plank struct {
 	// stuck in an unscheduled state. Defaults to 5 minutes.
 	PodUnscheduledTimeout *metav1.Duration `json:"pod_unscheduled_timeout,omitempty"`
 
+	// MaxRevivals is the maximum number of times a prowjob will be retried in case of an
+	// unexpected stop of the job before being marked as failed. Generally a job is stopped
+	// unexpectedly due to the underlying Node being terminated, evicted or becoming unreachable.
+	// Defaults to 3. A value of 0 means no retries.
+	MaxRevivals *int `json:"max_revivals,omitempty"`
+
 	// DefaultDecorationConfigs holds the default decoration config for specific values.
 	//
 	// Each entry in the slice specifies Repo and Cluster regexp filter fields to
@@ -2503,6 +2509,11 @@ func parseProwConfig(c *Config) error {
 		c.Plank.PodUnscheduledTimeout = &metav1.Duration{Duration: 5 * time.Minute}
 	}
 
+	if c.Plank.MaxRevivals == nil {
+		maxRetries := 3
+		c.Plank.MaxRevivals = &maxRetries
+	}
+
 	if err := c.Gerrit.DefaultAndValidate(); err != nil {
 		return fmt.Errorf("validating gerrit config: %w", err)
 	}
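MaxRevivals is a *int rather than a plain int so the defaulting above can tell "unset" (nil, which becomes 3) apart from an explicit 0 (which disables revivals). A standalone sketch of that pattern, using a hypothetical defaultMaxRevivals helper:

package main

import "fmt"

// defaultMaxRevivals reproduces the nil-check from parseProwConfig:
// only a nil pointer receives the default.
func defaultMaxRevivals(configured *int) int {
	if configured == nil {
		maxRetries := 3
		configured = &maxRetries
	}
	return *configured
}

func main() {
	zero := 0
	fmt.Println(defaultMaxRevivals(nil))   // 3 (unset falls back to the default)
	fmt.Println(defaultMaxRevivals(&zero)) // 0 (explicit zero disables revivals)
}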

pkg/config/config_test.go

Lines changed: 4 additions & 0 deletions

@@ -8426,6 +8426,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
@@ -8510,6 +8511,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
@@ -8587,6 +8589,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
@@ -8669,6 +8672,7 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_revivals: 3
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

pkg/config/prow-config-documented.yaml

Lines changed: 5 additions & 0 deletions

@@ -1301,6 +1301,11 @@ plank:
   # JobURLPrefixDisableAppendStorageProvider disables that the storageProvider is
   # automatically appended to the JobURLPrefix.
   jobURLPrefixDisableAppendStorageProvider: true
+  # MaxRevivals is the maximum number of times a prowjob will be retried in case of an
+  # unexpected stop of the job before being marked as failed. Generally a job is stopped
+  # unexpectedly due to the underlying Node being terminated, evicted or becoming unreachable.
+  # Defaults to 3. A value of 0 means no retries.
+  max_revivals: 0
   # PodPendingTimeout defines how long the controller will wait to perform a garbage
   # collection on pending pods. Defaults to 10 minutes.
   pod_pending_timeout: 0s
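The same unset-versus-zero distinction applies when the config is read from YAML: omitting max_revivals leaves the pointer nil and the default of 3 kicks in later, while an explicit max_revivals: 0 survives parsing and turns revivals off. A sketch with a stand-in struct, assuming the sigs.k8s.io/yaml package commonly used in this codebase:

package main

import (
	"fmt"

	"sigs.k8s.io/yaml"
)

// plank stands in for the MaxRevivals slice of the real Plank config.
type plank struct {
	MaxRevivals *int `json:"max_revivals,omitempty"`
}

func main() {
	var unset, disabled plank
	_ = yaml.Unmarshal([]byte("{}"), &unset)
	_ = yaml.Unmarshal([]byte("max_revivals: 0"), &disabled)
	fmt.Println(unset.MaxRevivals == nil) // true (will be defaulted to 3 later)
	fmt.Println(*disabled.MaxRevivals)    // 0 (explicitly disabled)
}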

pkg/plank/controller_test.go

Lines changed: 38 additions & 0 deletions

@@ -65,6 +65,8 @@ const (
 	podDeletionPreventionFinalizer = "keep-from-vanishing"
 )
 
+var maxRevivals = 3
+
 func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[string]int) *fca {
 	presubmits := []config.Presubmit{
 		{
@@ -106,6 +108,7 @@ func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[st
 			PodPendingTimeout:     &metav1.Duration{Duration: podPendingTimeout},
 			PodRunningTimeout:     &metav1.Duration{Duration: podRunningTimeout},
 			PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
+			MaxRevivals:           &maxRevivals,
 		},
 	},
 	JobConfig: config.JobConfig{
@@ -1180,6 +1183,41 @@ func TestSyncPendingJob(t *testing.T) {
 			ExpectedNumPods:  1,
 			ExpectedURL:      "boop-42/error",
 		},
+		{
+			// TODO: this test case tests the current behavior, but the behavior
+			// is non-ideal: the pod execution did not fail, instead the node on which
+			// the pod was running terminated
+			Name: "a terminated pod is handled as-if it failed",
+			PJ: prowapi.ProwJob{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "boop-42",
+					Namespace: "prowjobs",
+				},
+				Spec: prowapi.ProwJobSpec{
+					PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
+				},
+				Status: prowapi.ProwJobStatus{
+					State:   prowapi.PendingState,
+					PodName: "boop-42",
+				},
+			},
+			Pods: []v1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "boop-42",
+						Namespace: "pods",
+					},
+					Status: v1.PodStatus{
+						Phase:  v1.PodFailed,
+						Reason: Terminated,
+					},
+				},
+			},
+			ExpectedComplete: true,
+			ExpectedState:    prowapi.FailureState,
+			ExpectedNumPods:  1,
+			ExpectedURL:      "boop-42/error",
+		},
 		{
 			Name: "running pod",
 			PJ: prowapi.ProwJob{

pkg/plank/reconciler.go

Lines changed: 53 additions & 52 deletions

@@ -62,7 +62,8 @@ const ControllerName = "plank"
 
 // PodStatus constants
 const (
-	Evicted = "Evicted"
+	Evicted    = "Evicted"
+	Terminated = "Terminated"
 )
 
 // NodeStatus constants
@@ -468,76 +469,51 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.Status.PodName = pn
 			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
 		}
-	} else if pod.Status.Reason == Evicted {
-		// Pod was evicted.
-		if pj.Spec.ErrorOnEviction {
-			// ErrorOnEviction is enabled, complete the PJ and mark it as
-			// errored.
+	} else if podUnexpectedStopCause := getPodUnexpectedStopCause(pod); podUnexpectedStopCause != PodUnexpectedStopCauseNone {
+		switch {
+		case podUnexpectedStopCause == PodUnexpectedStopCauseEvicted && pj.Spec.ErrorOnEviction:
+			// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
 			r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod was evicted by the cluster."
-		} else {
-			// ErrorOnEviction is disabled. Delete the pod now and recreate it in
-			// the next resync.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, deleting & next sync loop will restart pod")
+		case pj.Status.RevivalCount >= *r.config().Plank.MaxRevivals:
+			// MaxRevivals is reached, complete the PJ and mark it as errored.
+			r.log.WithField("unexpected-stop-cause", podUnexpectedStopCause).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
+			pj.SetComplete()
+			pj.Status.State = prowv1.ErrorState
+			pj.Status.Description = fmt.Sprintf("Job pod reached max revivals (%d) after being stopped unexpectedly (%s)", pj.Status.RevivalCount, podUnexpectedStopCause)
+		default:
+			// Update the revival count and delete the pod so it gets recreated in the next resync.
+			pj.Status.RevivalCount++
+			r.log.
+				WithField("unexpected-stop-cause", podUnexpectedStopCause).
+				WithFields(pjutil.ProwJobFields(pj)).
+				Info("Pod has stopped unexpectedly, deleting & next sync loop will restart pod")
+
 			client, ok := r.buildClients[pj.ClusterAlias()]
 			if !ok {
-				return nil, TerminalError(fmt.Errorf("evicted pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+				return nil, TerminalError(fmt.Errorf("pod %s which was stopped unexpectedly (%s): unknown cluster alias %q", pod.Name, podUnexpectedStopCause, pj.ClusterAlias()))
 			}
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
+			if finalizers := sets.New(pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
 				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
 				oldPod := pod.DeepCopy()
 				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
 				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
 				}
 			}
-			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
-			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-		}
-	} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
-		// This can happen in any phase and means the node got evicted after it became unresponsive. Delete the finalizer so the pod
-		// vanishes and we will silently re-create it in the next iteration.
-		r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got lost, deleting & next sync loop will restart pod")
-		client, ok := r.buildClients[pj.ClusterAlias()]
-		if !ok {
-			return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
-		}
 
-		if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-			// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-			oldPod := pod.DeepCopy()
-			pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-			if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-				return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-			}
-		}
-
-		return nil, nil
-	} else {
-		switch pod.Status.Phase {
-		case corev1.PodUnknown:
-			// Pod is in Unknown state. This can happen if there is a problem with
-			// the node. Delete the old pod, this will fire an event that triggers
-			// a new reconciliation in which we will re-create the pod.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is in unknown state, deleting & restarting pod")
-			client, ok := r.buildClients[pj.ClusterAlias()]
-			if !ok {
-				return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+			// Pod is already deleted, so we don't need to delete it again.
+			if pod.DeletionTimestamp != nil {
+				return nil, nil
 			}
 
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-				oldPod := pod.DeepCopy()
-				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-				}
-			}
 			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
 			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-
+		}
+	} else {
+		switch pod.Status.Phase {
 		case corev1.PodSucceeded:
 			pj.SetComplete()
 			// There were bugs around this in the past so be paranoid and verify each container
@@ -679,6 +655,31 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 		return nil, nil
 	}
 
+type PodUnexpectedStopCause string
+
+const (
+	PodUnexpectedStopCauseNone        PodUnexpectedStopCause = ""
+	PodUnexpectedStopCauseUnknown     PodUnexpectedStopCause = "unknown"
+	PodUnexpectedStopCauseEvicted     PodUnexpectedStopCause = "evicted"
+	PodUnexpectedStopCauseUnreachable PodUnexpectedStopCause = "unreachable"
+)
+
+func getPodUnexpectedStopCause(pod *corev1.Pod) PodUnexpectedStopCause {
+	if pod.Status.Reason == Evicted {
+		return PodUnexpectedStopCauseEvicted
+	}
+
+	if pod.Status.Reason == NodeUnreachablePodReason && pod.DeletionTimestamp != nil {
+		return PodUnexpectedStopCauseUnreachable
+	}
+
+	if pod.Status.Phase == corev1.PodUnknown {
+		return PodUnexpectedStopCauseUnknown
+	}
+
+	return PodUnexpectedStopCauseNone
+}
+
 // syncTriggeredJob syncs jobs that do not yet have an associated test workload running
 func (r *reconciler) syncTriggeredJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
 	prevPJ := pj.DeepCopy()
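Note that the order of checks in getPodUnexpectedStopCause is deliberate: eviction is classified first even though an evicted pod also reports a failed phase, and an unreachable node only counts once the pod carries a deletion timestamp. Below is a self-contained sketch of that precedence; the local constants stand in for the reconciler's, and the "NodeLost" value used for NodeUnreachablePodReason is an assumption, not taken from this diff.

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Stand-ins for the reconciler's constants.
const (
	evicted         = "Evicted"
	nodeUnreachable = "NodeLost" // assumed value of NodeUnreachablePodReason
)

// classify mirrors getPodUnexpectedStopCause from the diff above.
func classify(pod *corev1.Pod) string {
	if pod.Status.Reason == evicted {
		return "evicted"
	}
	if pod.Status.Reason == nodeUnreachable && pod.DeletionTimestamp != nil {
		return "unreachable"
	}
	if pod.Status.Phase == corev1.PodUnknown {
		return "unknown"
	}
	return "none"
}

func main() {
	now := metav1.NewTime(time.Now())
	fmt.Println(classify(&corev1.Pod{
		Status: corev1.PodStatus{Phase: corev1.PodFailed, Reason: evicted},
	})) // evicted (wins even though the phase is Failed)
	fmt.Println(classify(&corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &now},
		Status:     corev1.PodStatus{Reason: nodeUnreachable},
	})) // unreachable (requires the deletion timestamp)
	fmt.Println(classify(&corev1.Pod{
		Status: corev1.PodStatus{Reason: nodeUnreachable},
	})) // none (without a deletion timestamp the pod is not treated as stopped)
}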
