Skip to content

Commit 48c5a23

Browse files
authored
Merge branch 'main' into restrict-webhook-config-resource-names
2 parents 2ffc3cd + 0767a6d commit 48c5a23

File tree

17 files changed

+640
-83
lines changed

17 files changed

+640
-83
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
1414
- Added enforcement of the `nvidia` runtime class for GPU pods, with the option to enforce a custom runtime class, or disable enforcement entirely.
1515
- Added a preferred podAntiAffinity term by default for all services; it can be made required instead by setting `global.requireDefaultPodAffinityTerm`
1616
- Added support for service-level affinities
17+
- Added time-aware scheduling configuration to the scheduling shard
1718

1819
### Fixed
1920
- (OpenShift only) - High CPU usage for the operator pod due to continuous reconciles
2021
- Fixed a bug where the scheduler would not re-try updating podgroup status after failure
2122
- Fixed a bug where ray workloads gang scheduling would ignore `minReplicas` if autoscaling was not set
2223
- Fixed wrong KAI Config statuses when the Prometheus operand is enabled
24+
- GPU-Operator v25.10.0 support for CDI enabled environments
2325

2426
## [v0.9.1] - 2025-09-15
2527

cmd/podgrouper/app/app.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import (
2424
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
2525
"sigs.k8s.io/controller-runtime/pkg/webhook"
2626

27-
"github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2"
27+
v2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2"
2828
kubeAiSchedulerV2alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2alpha2"
2929
controllers "github.com/NVIDIA/KAI-scheduler/pkg/podgrouper"
3030
pluginshub "github.com/NVIDIA/KAI-scheduler/pkg/podgrouper/podgrouper/hub"
@@ -51,7 +51,7 @@ func init() {
5151

5252
type App struct {
5353
Mgr manager.Manager
54-
DefaultPluginsHub pluginshub.PluginsHub
54+
DefaultPluginsHub *pluginshub.DefaultPluginsHub
5555

5656
configs controllers.Configs
5757
pluginsHub pluginshub.PluginsHub

deployments/kai-scheduler/crds/kai.scheduler_configs.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3256,8 +3256,6 @@ spec:
32563256
description: PodLabelSelector filters pods for webhooks and pod
32573257
grouper
32583258
type: object
3259-
prometheusEnabled:
3260-
type: boolean
32613259
queueLabelKey:
32623260
description: QueueLabelKey specifies the pod label key whose value
32633261
will be the queue name of the pod.

deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ spec:
5555
* Only valid flags defined in the scheduler's flag set will be accepted
5656
* Duplicated flags will override the behavior of flags generated by other fields
5757
type: object
58+
kValue:
59+
description: KValue specifies the kValue for the proportion plugin.
60+
Default is 1.0.
61+
type: number
5862
minRuntime:
5963
description: MinRuntime specifies the minimum runtime of a jobs in
6064
the shard
@@ -87,6 +91,54 @@ spec:
8791
description: QueueDepthPerAction max number of jobs to try for action
8892
per queue
8993
type: object
94+
usageDBConfig:
95+
description: UsageDBConfig defines configuration for the usage db
96+
client
97+
properties:
98+
clientType:
99+
type: string
100+
connectionString:
101+
type: string
102+
connectionStringEnvVar:
103+
type: string
104+
usageParams:
105+
description: UsageParams defines common params for all usage db
106+
clients. Some clients may not support all the params.
107+
properties:
108+
extraParams:
109+
additionalProperties:
110+
type: string
111+
description: ExtraParams are extra parameters for the usage
112+
db client, which are client specific.
113+
type: object
114+
fetchInterval:
115+
description: Fetch interval of the usage. Default is 1 minute.
116+
type: string
117+
halfLifePeriod:
118+
description: Half life period of the usage. If not set, or
119+
set to 0, the usage will not be decayed.
120+
type: string
121+
stalenessPeriod:
122+
description: Staleness period of the usage. Default is 5 minutes.
123+
type: string
124+
tumblingWindowCronString:
125+
description: A cron string used to determine when to reset
126+
resource usage for all queues.
127+
type: string
128+
waitTimeout:
129+
description: Wait timeout of the usage. Default is 1 minute.
130+
type: string
131+
windowSize:
132+
description: Window size of the usage. Default is 1 week.
133+
type: string
134+
windowType:
135+
description: Window type for time-series aggregation. If not
136+
set, defaults to sliding.
137+
type: string
138+
type: object
139+
required:
140+
- clientType
141+
type: object
90142
type: object
91143
status:
92144
description: SchedulingShardStatus defines the observed state of SchedulingShard

deployments/kai-scheduler/templates/rbac/prometheus-binding.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ metadata:
77
name: kai-prometheus
88
subjects:
99
- kind: ServiceAccount
10-
name: kai-prometheus
10+
name: prometheus
1111
namespace: kai-scheduler
1212
roleRef:
1313
kind: ClusterRole

pkg/apis/kai/v1/global.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,6 @@ type GlobalConfig struct {
6868
// +kubebuilder:validation:Optional
6969
PodLabelSelector map[string]string `json:"podLabelSelector,omitempty"`
7070

71-
// +kubebuilder:validation:Optional
72-
PrometheusEnabled *bool `json:"prometheusEnabled,omitempty"`
73-
7471
// Connection defines the connection configuration for TSDB
7572
// +kubebuilder:validation:Optional
7673
ExternalTSDBConnection *Connection `json:"connection,omitempty"`

pkg/apis/kai/v1/schedulingshard_types.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"k8s.io/utils/ptr"
2222

2323
"github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/common"
24+
usagedbapi "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/cache/usagedb/api"
2425
)
2526

2627
const (
@@ -58,6 +59,14 @@ type SchedulingShardSpec struct {
5859
// MinRuntime specifies the minimum runtime of a jobs in the shard
5960
// +kubebuilder:validation:Optional
6061
MinRuntime *MinRuntime `json:"minRuntime,omitempty"`
62+
63+
// KValue specifies the kValue for the proportion plugin. Default is 1.0.
64+
// +kubebuilder:validation:Optional
65+
KValue *float64 `json:"kValue,omitempty"`
66+
67+
// UsageDBConfig defines configuration for the usage db client
68+
// +kubebuilder:validation:Optional
69+
UsageDBConfig *usagedbapi.UsageDBConfig `yaml:"usageDBConfig,omitempty" json:"usageDBConfig,omitempty"`
6170
}
6271

6372
func (s *SchedulingShardSpec) SetDefaultsWhereNeeded() {

pkg/apis/kai/v1/zz_generated.deepcopy.go

Lines changed: 9 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/operator/operands/binder/binder_test.go

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -96,26 +96,59 @@ var _ = Describe("Binder", func() {
9696
Expect(deployment.Spec.Template.Labels).To(HaveKeyWithValue("kai", "scheduler"))
9797
})
9898

99-
It("sets CDI flag if set in cluser policy", func(ctx context.Context) {
100-
clusterPolicy := &nvidiav1.ClusterPolicy{
101-
ObjectMeta: metav1.ObjectMeta{
102-
Name: "test",
103-
},
104-
Spec: nvidiav1.ClusterPolicySpec{
105-
CDI: nvidiav1.CDIConfigSpec{
106-
Enabled: ptr.To(true),
107-
Default: ptr.To(true),
99+
Context("CDI Detection", func() {
100+
var (
101+
clusterPolicy *nvidiav1.ClusterPolicy
102+
)
103+
BeforeEach(func() {
104+
clusterPolicy = &nvidiav1.ClusterPolicy{
105+
ObjectMeta: metav1.ObjectMeta{
106+
Name: "test",
108107
},
109-
},
110-
}
108+
Spec: nvidiav1.ClusterPolicySpec{
109+
CDI: nvidiav1.CDIConfigSpec{
110+
Enabled: ptr.To(true),
111+
Default: ptr.To(true),
112+
},
113+
},
114+
}
115+
})
111116

112-
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
113-
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
114-
Expect(err).To(BeNil())
117+
It("sets CDI flag if set in cluser policy", func(ctx context.Context) {
118+
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
119+
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
120+
Expect(err).To(BeNil())
115121

116-
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
117-
Expect(deploymentT).NotTo(BeNil())
118-
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true"))
122+
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
123+
Expect(deploymentT).NotTo(BeNil())
124+
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true"))
125+
})
126+
127+
It("sets CDI flag to false if not set by default cluser policy", func(ctx context.Context) {
128+
clusterPolicy.Spec.CDI.Default = ptr.To(false)
129+
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
130+
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
131+
Expect(err).To(BeNil())
132+
133+
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
134+
Expect(deploymentT).NotTo(BeNil())
135+
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=false"))
136+
})
137+
138+
It("detects CDI state with GPU Operator >= v25.10.0", func(ctx context.Context) {
139+
clusterPolicy.Labels = map[string]string{
140+
versionLabelName: gpuOperatorVersionDefaultCDIDeprecated,
141+
}
142+
clusterPolicy.Spec.CDI.Default = ptr.To(false)
143+
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
144+
145+
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
146+
Expect(err).To(BeNil())
147+
148+
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
149+
Expect(deploymentT).NotTo(BeNil())
150+
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true"))
151+
})
119152
})
120153
})
121154

pkg/operator/operands/binder/resources.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"k8s.io/apimachinery/pkg/api/meta"
1515
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1616
"k8s.io/apimachinery/pkg/util/intstr"
17+
"k8s.io/apimachinery/pkg/version"
1718
"sigs.k8s.io/controller-runtime/pkg/client"
1819
"sigs.k8s.io/controller-runtime/pkg/log"
1920

@@ -26,7 +27,9 @@ import (
2627
)
2728

2829
const (
29-
defaultResourceName = "binder"
30+
defaultResourceName = "binder"
31+
gpuOperatorVersionDefaultCDIDeprecated = "v25.10.0"
32+
versionLabelName = "app.kubernetes.io/version"
3033
)
3134

3235
func (b *Binder) deploymentForKAIConfig(
@@ -184,6 +187,10 @@ func isCdiEnabled(ctx context.Context, readerClient client.Reader) (bool, error)
184187

185188
nvidiaClusterPolicy := nvidiaClusterPolicies.Items[0]
186189
if nvidiaClusterPolicy.Spec.CDI.Enabled != nil && *nvidiaClusterPolicy.Spec.CDI.Enabled {
190+
gpuOperatorVersion, found := nvidiaClusterPolicy.Labels[versionLabelName]
191+
if found && version.CompareKubeAwareVersionStrings(gpuOperatorVersion, gpuOperatorVersionDefaultCDIDeprecated) >= 0 {
192+
return true, nil
193+
}
187194
if nvidiaClusterPolicy.Spec.CDI.Default != nil && *nvidiaClusterPolicy.Spec.CDI.Default {
188195
return true, nil
189196
}

0 commit comments

Comments
 (0)