Skip to content

Commit 5eb9a2a

Browse files
authored
Merge pull request #423 from danielvegamyhre/automated-cherry-pick-of-#421-upstream-release-0.3
Automated cherry pick of #421: add clearer error message for pod name too long
2 parents b45f339 + 1926986 commit 5eb9a2a

File tree

3 files changed

+39
-6
lines changed

3 files changed

+39
-6
lines changed

api/jobset/v1alpha2/jobset_webhook.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"fmt"
1919
"math"
2020
"strconv"
21+
"strings"
2122

2223
apivalidation "k8s.io/apimachinery/pkg/api/validation"
2324
"k8s.io/apimachinery/pkg/runtime"
@@ -36,6 +37,24 @@ import (
3637
corev1 "k8s.io/api/core/v1"
3738
)
3839

40+
const (
41+
// This is the error message returned by IsDNS1035Label when the given input
42+
// is longer than 63 characters.
43+
dns1035MaxLengthExceededErrorMsg = "must be no more than 63 characters"
44+
45+
// Error message returned by JobSet validation if the generated child jobs
46+
// will be longer than 63 characters.
47+
jobNameTooLongErrorMsg = "JobSet name is too long, job names generated for this JobSet will exceed 63 characters"
48+
49+
// Error message returned by JobSet validation if the generated pod names
50+
// will be longer than 63 characters.
51+
podNameTooLongErrorMsg = "JobSet name is too long, pod names generated for this JobSet will exceed 63 characters"
52+
53+
// Error message returned by JobSet validation if the network subdomain
54+
// will be longer than 63 characters.
55+
subdomainTooLongErrMsg = ".spec.network.subdomain is too long, must be less than 63 characters"
56+
)
57+
3958
func (js *JobSet) SetupWebhookWithManager(mgr ctrl.Manager) error {
4059
return ctrl.NewWebhookManagedBy(mgr).
4160
For(js).
@@ -92,6 +111,9 @@ func (js *JobSet) ValidateCreate() (admission.Warnings, error) {
92111

93112
// Since subdomain name is also used as service name, it must adhere to RFC 1035 as well.
94113
for _, errMessage := range validation.IsDNS1035Label(js.Spec.Network.Subdomain) {
114+
if strings.Contains(errMessage, dns1035MaxLengthExceededErrorMsg) {
115+
errMessage = subdomainTooLongErrMsg
116+
}
95117
allErrs = append(allErrs, fmt.Errorf(errMessage))
96118
}
97119
}
@@ -108,6 +130,9 @@ func (js *JobSet) ValidateCreate() (admission.Warnings, error) {
108130
// Use the largest job index as it will have the longest name.
109131
longestJobName := placement.GenJobName(js.Name, rjob.Name, int(rjob.Replicas-1))
110132
for _, errMessage := range validation.IsDNS1035Label(longestJobName) {
133+
if strings.Contains(errMessage, dns1035MaxLengthExceededErrorMsg) {
134+
errMessage = jobNameTooLongErrorMsg
135+
}
111136
allErrs = append(allErrs, fmt.Errorf(errMessage))
112137
}
113138
// Check that the generated pod names for the replicated job is DNS 1035 compliant.
@@ -118,6 +143,9 @@ func (js *JobSet) ValidateCreate() (admission.Warnings, error) {
118143
// Add 5 char suffix to the deterministic part of the pod name to validate the full pod name is compliant.
119144
longestPodName := placement.GenPodName(js.Name, rjob.Name, maxJobIndex, maxPodIndex) + "-abcde"
120145
for _, errMessage := range validation.IsDNS1035Label(longestPodName) {
146+
if strings.Contains(errMessage, dns1035MaxLengthExceededErrorMsg) {
147+
errMessage = podNameTooLongErrorMsg
148+
}
121149
allErrs = append(allErrs, fmt.Errorf(errMessage))
122150
}
123151
}

api/jobset/v1alpha2/jobset_webhook_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,7 @@ func TestValidateCreate(t *testing.T) {
463463
},
464464
},
465465
want: errors.Join(
466-
fmt.Errorf("must be no more than 63 characters"),
466+
fmt.Errorf(subdomainTooLongErrMsg),
467467
),
468468
},
469469
{
@@ -507,7 +507,7 @@ func TestValidateCreate(t *testing.T) {
507507
},
508508
},
509509
want: errors.Join(
510-
fmt.Errorf("must be no more than 63 characters"),
510+
fmt.Errorf(jobNameTooLongErrorMsg),
511511
),
512512
},
513513
{
@@ -534,7 +534,7 @@ func TestValidateCreate(t *testing.T) {
534534
},
535535
},
536536
want: errors.Join(
537-
fmt.Errorf("must be no more than 63 characters"),
537+
fmt.Errorf(podNameTooLongErrorMsg),
538538
),
539539
},
540540
}

pkg/webhooks/pod_admission_webhook.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ func (p *podWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (ad
6161
return nil, err
6262
}
6363
if !leaderScheduled {
64-
return nil, fmt.Errorf("leader pod not yet scheduled, not creating follower pod %q", pod.Name)
64+
return nil, fmt.Errorf("leader pod not yet scheduled, not creating follower pod. this is an expected, transient error.")
6565
}
6666
return nil, nil
6767
}
@@ -75,11 +75,16 @@ func (p *podWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (ad
7575
}
7676

7777
func (p *podWebhook) leaderPodScheduled(ctx context.Context, pod *corev1.Pod) (bool, error) {
78+
log := ctrl.LoggerFrom(ctx)
7879
leaderPod, err := p.leaderPodForFollower(ctx, pod)
7980
if err != nil {
8081
return false, err
8182
}
82-
return leaderPod.Spec.NodeName != "", nil
83+
scheduled := leaderPod.Spec.NodeName != ""
84+
if !scheduled {
85+
log.V(3).Info("leader pod %s is not yet scheduled", leaderPod.Name)
86+
}
87+
return scheduled, nil
8388
}
8489

8590
func (p *podWebhook) leaderPodForFollower(ctx context.Context, pod *corev1.Pod) (*corev1.Pod, error) {
@@ -99,7 +104,7 @@ func (p *podWebhook) leaderPodForFollower(ctx context.Context, pod *corev1.Pod)
99104

100105
// Validate there is only 1 leader pod for this job.
101106
if len(podList.Items) != 1 {
102-
return nil, fmt.Errorf("incorrect number of leader pods for this job (expected 1, got %d)", len(podList.Items))
107+
return nil, fmt.Errorf("expected 1 leader pod (%s), but got %d. this is an expected, transient error.", leaderPodName, len(podList.Items))
103108
}
104109

105110
// Check if the leader pod is scheduled.

0 commit comments

Comments
 (0)