Skip to content

Commit f76f2a7

Browse files
authored
Add restart strategy (#686)
* Add restart strategy to API * Remove webhook, replace with openapi defaulting * Update creation logic to account for BlockingRecreate * Update integration test for existing Recreate strategy * Fix integration tests * Document restartStrategy by reusing existing max-restarts example * Address comment * Add enum validation * make generate * Fix issues with make generate * Fix example
1 parent 440f53e commit f76f2a7

File tree

19 files changed

+279
-16
lines changed

19 files changed

+279
-16
lines changed

api/jobset/v1alpha2/jobset_types.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,13 +307,31 @@ type FailurePolicy struct {
307307
// A restart is achieved by recreating all active child jobs.
308308
MaxRestarts int32 `json:"maxRestarts,omitempty"`
309309

310+
// RestartStrategy defines the strategy to use when restarting the JobSet.
311+
// Defaults to Recreate.
312+
// +optional
313+
// +kubebuilder:default=Recreate
314+
RestartStrategy JobSetRestartStrategy `json:"restartStrategy,omitempty"`
315+
310316
// List of failure policy rules for this JobSet.
311317
// For a given Job failure, the rules will be evaluated in order,
312318
// and only the first matching rule will be executed.
313319
// If no matching rule is found, the RestartJobSet action is applied.
314320
Rules []FailurePolicyRule `json:"rules,omitempty"`
315321
}
316322

323+
// +kubebuilder:validation:Enum=Recreate;BlockingRecreate
324+
type JobSetRestartStrategy string
325+
326+
const (
327+
// Recreate Jobs on a Job-by-Job basis.
328+
Recreate JobSetRestartStrategy = "Recreate"
329+
330+
// BlockingRecreate ensures that all Jobs (and Pods) from a previous iteration are deleted before
331+
// creating new Jobs.
332+
BlockingRecreate JobSetRestartStrategy = "BlockingRecreate"
333+
)
334+
317335
type SuccessPolicy struct {
318336
// Operator determines either All or Any of the selected jobs should succeed to consider the JobSet successful
319337
// +kubebuilder:validation:Enum=All;Any

api/jobset/v1alpha2/openapi_generated.go

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client-go/applyconfiguration/jobset/v1alpha2/failurepolicy.go

Lines changed: 15 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,15 @@ spec:
9797
A restart is achieved by recreating all active child jobs.
9898
format: int32
9999
type: integer
100+
restartStrategy:
101+
default: Recreate
102+
description: |-
103+
RestartStrategy defines the strategy to use when restarting the JobSet.
104+
Defaults to Recreate.
105+
enum:
106+
- Recreate
107+
- BlockingRecreate
108+
type: string
100109
rules:
101110
description: |-
102111
List of failure policy rules for this JobSet.

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module sigs.k8s.io/jobset
22

3-
go 1.23
3+
go 1.23.0
44

55
require (
66
github.com/google/go-cmp v0.6.0

hack/python-sdk/swagger.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@
3939
"type": "integer",
4040
"format": "int32"
4141
},
42+
"restartStrategy": {
43+
"description": "RestartStrategy defines the strategy to use when restarting the JobSet. Defaults to Recreate.",
44+
"type": "string"
45+
},
4246
"rules": {
4347
"description": "List of failure policy rules for this JobSet. For a given Job failure, the rules will be evaluated in order, and only the first matching rule will be executed. If no matching rule is found, the RestartJobSet action is applied.",
4448
"type": "array",

pkg/controllers/jobset_controller.go

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,9 @@ type childJobs struct {
6363
successful []*batchv1.Job
6464
failed []*batchv1.Job
6565

66-
// Jobs marked for deletion are mutually exclusive with the set of jobs in active, successful, and failed.
67-
delete []*batchv1.Job
66+
// Jobs from a previous restart (marked for deletion) are mutually exclusive
67+
// with the set of jobs in active, successful, and failed.
68+
previous []*batchv1.Job
6869
}
6970

7071
// statusUpdateOpts tracks if a JobSet status update should be performed at the end of the reconciliation
@@ -169,8 +170,8 @@ func (r *JobSetReconciler) reconcile(ctx context.Context, js *jobset.JobSet, upd
169170
return ctrl.Result{}, nil
170171
}
171172

172-
// Delete any jobs marked for deletion.
173-
if err := r.deleteJobs(ctx, ownedJobs.delete); err != nil {
173+
// Delete all jobs from a previous restart that are marked for deletion.
174+
if err := r.deleteJobs(ctx, ownedJobs.previous); err != nil {
174175
log.Error(err, "deleting jobs")
175176
return ctrl.Result{}, err
176177
}
@@ -281,11 +282,11 @@ func (r *JobSetReconciler) getChildJobs(ctx context.Context, js *jobset.JobSet)
281282
jobRestarts, err := strconv.Atoi(job.Labels[constants.RestartsKey])
282283
if err != nil {
283284
log.Error(err, fmt.Sprintf("invalid value for label %s, must be integer", constants.RestartsKey))
284-
ownedJobs.delete = append(ownedJobs.delete, &childJobList.Items[i])
285+
ownedJobs.previous = append(ownedJobs.previous, &childJobList.Items[i])
285286
return nil, err
286287
}
287288
if int32(jobRestarts) < js.Status.Restarts {
288-
ownedJobs.delete = append(ownedJobs.delete, &childJobList.Items[i])
289+
ownedJobs.previous = append(ownedJobs.previous, &childJobList.Items[i])
289290
continue
290291
}
291292

@@ -637,6 +638,13 @@ func executeSuccessPolicy(js *jobset.JobSet, ownedJobs *childJobs, updateStatusO
637638

638639
func constructJobsFromTemplate(js *jobset.JobSet, rjob *jobset.ReplicatedJob, ownedJobs *childJobs) []*batchv1.Job {
639640
var jobs []*batchv1.Job
641+
// If the JobSet is using the BlockingRecreate failure policy, we should not create any new jobs until
642+
// all the jobs slated for deletion (i.e. from the last restart index) have been deleted.
643+
useBlockingRecreate := js.Spec.FailurePolicy != nil && js.Spec.FailurePolicy.RestartStrategy == jobset.BlockingRecreate
644+
if len(ownedJobs.previous) > 0 && useBlockingRecreate {
645+
return jobs
646+
}
647+
640648
for jobIdx := 0; jobIdx < int(rjob.Replicas); jobIdx++ {
641649
jobName := placement.GenJobName(js.Name, rjob.Name, jobIdx)
642650
if create := shouldCreateJob(jobName, ownedJobs); !create {
@@ -700,7 +708,7 @@ func shouldCreateJob(jobName string, ownedJobs *childJobs) bool {
700708
// TODO: maybe we can use a job map here so we can do O(1) lookups
701709
// to check if the job already exists, rather than a linear scan
702710
// through all the jobs owned by the jobset.
703-
for _, job := range collections.Concat(ownedJobs.active, ownedJobs.successful, ownedJobs.failed, ownedJobs.delete) {
711+
for _, job := range collections.Concat(ownedJobs.active, ownedJobs.successful, ownedJobs.failed, ownedJobs.previous) {
704712
if jobName == job.Name {
705713
return false
706714
}

pkg/controllers/jobset_controller_test.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ func TestConstructJobsFromTemplate(t *testing.T) {
260260
Replicas(2).
261261
Obj()).Obj(),
262262
ownedJobs: &childJobs{
263-
delete: []*batchv1.Job{
263+
previous: []*batchv1.Job{
264264
testutils.MakeJob("test-jobset-replicated-job-0", ns).Obj(),
265265
},
266266
},
@@ -275,6 +275,23 @@ func TestConstructJobsFromTemplate(t *testing.T) {
275275
Suspend(false).Obj(),
276276
},
277277
},
278+
{
279+
name: "job creation blocked until all previous jobs no longer exist",
280+
js: testutils.MakeJobSet(jobSetName, ns).
281+
FailurePolicy(&jobset.FailurePolicy{
282+
RestartStrategy: jobset.BlockingRecreate,
283+
}).
284+
ReplicatedJob(testutils.MakeReplicatedJob(replicatedJobName).
285+
Job(testutils.MakeJobTemplate(jobName, ns).Obj()).
286+
Replicas(2).
287+
Obj()).Obj(),
288+
ownedJobs: &childJobs{
289+
previous: []*batchv1.Job{
290+
testutils.MakeJob("test-jobset-replicated-job-0", ns).Obj(),
291+
},
292+
},
293+
want: nil,
294+
},
278295
{
279296
name: "multiple replicated jobs",
280297
js: testutils.MakeJobSet(jobSetName, ns).

sdk/python/docs/JobsetV1alpha2FailurePolicy.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
Name | Type | Description | Notes
55
------------ | ------------- | ------------- | -------------
66
**max_restarts** | **int** | MaxRestarts defines the limit on the number of JobSet restarts. A restart is achieved by recreating all active child jobs. | [optional]
7+
**restart_strategy** | **str** | RestartStrategy defines the strategy to use when restarting the JobSet. Defaults to Recreate. | [optional]
78
**rules** | [**list[JobsetV1alpha2FailurePolicyRule]**](JobsetV1alpha2FailurePolicyRule.md) | List of failure policy rules for this JobSet. For a given Job failure, the rules will be evaluated in order, and only the first matching rule will be executed. If no matching rule is found, the RestartJobSet action is applied. | [optional]
89

910
[[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md)

sdk/python/jobset/models/jobset_v1alpha2_failure_policy.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,26 +34,31 @@ class JobsetV1alpha2FailurePolicy(object):
3434
"""
3535
openapi_types = {
3636
'max_restarts': 'int',
37+
'restart_strategy': 'str',
3738
'rules': 'list[JobsetV1alpha2FailurePolicyRule]'
3839
}
3940

4041
attribute_map = {
4142
'max_restarts': 'maxRestarts',
43+
'restart_strategy': 'restartStrategy',
4244
'rules': 'rules'
4345
}
4446

45-
def __init__(self, max_restarts=None, rules=None, local_vars_configuration=None): # noqa: E501
47+
def __init__(self, max_restarts=None, restart_strategy=None, rules=None, local_vars_configuration=None): # noqa: E501
4648
"""JobsetV1alpha2FailurePolicy - a model defined in OpenAPI""" # noqa: E501
4749
if local_vars_configuration is None:
4850
local_vars_configuration = Configuration()
4951
self.local_vars_configuration = local_vars_configuration
5052

5153
self._max_restarts = None
54+
self._restart_strategy = None
5255
self._rules = None
5356
self.discriminator = None
5457

5558
if max_restarts is not None:
5659
self.max_restarts = max_restarts
60+
if restart_strategy is not None:
61+
self.restart_strategy = restart_strategy
5762
if rules is not None:
5863
self.rules = rules
5964

@@ -80,6 +85,29 @@ def max_restarts(self, max_restarts):
8085

8186
self._max_restarts = max_restarts
8287

88+
@property
89+
def restart_strategy(self):
90+
"""Gets the restart_strategy of this JobsetV1alpha2FailurePolicy. # noqa: E501
91+
92+
RestartStrategy defines the strategy to use when restarting the JobSet. Defaults to Recreate. # noqa: E501
93+
94+
:return: The restart_strategy of this JobsetV1alpha2FailurePolicy. # noqa: E501
95+
:rtype: str
96+
"""
97+
return self._restart_strategy
98+
99+
@restart_strategy.setter
100+
def restart_strategy(self, restart_strategy):
101+
"""Sets the restart_strategy of this JobsetV1alpha2FailurePolicy.
102+
103+
RestartStrategy defines the strategy to use when restarting the JobSet. Defaults to Recreate. # noqa: E501
104+
105+
:param restart_strategy: The restart_strategy of this JobsetV1alpha2FailurePolicy. # noqa: E501
106+
:type: str
107+
"""
108+
109+
self._restart_strategy = restart_strategy
110+
83111
@property
84112
def rules(self):
85113
"""Gets the rules of this JobsetV1alpha2FailurePolicy. # noqa: E501

0 commit comments

Comments
 (0)