Skip to content

Commit 33ccb26

Browse files
authored
Merge branch 'main' into erez/support-gpu-operator-25-10-0-better
2 parents b2b6044 + d65607a commit 33ccb26

File tree

15 files changed

+581
-65
lines changed

15 files changed

+581
-65
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
1414
- Added enforcement of the `nvidia` runtime class for GPU pods, with the option to enforce a custom runtime class, or disable enforcement entirely.
1515
- Added a preferred podAntiAffinity term by default for all services; it can be made required instead by setting `global.requireDefaultPodAffinityTerm`
1616
- Added support for service-level affinities
17+
- Added time-aware scheduling configurations to the scheduling shard
1718

1819
### Fixed
1920
- (Openshift only) - High CPU usage for the operator pod due to continuous reconciles

cmd/podgrouper/app/app.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import (
2424
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
2525
"sigs.k8s.io/controller-runtime/pkg/webhook"
2626

27-
"github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2"
27+
v2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2"
2828
kubeAiSchedulerV2alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2alpha2"
2929
controllers "github.com/NVIDIA/KAI-scheduler/pkg/podgrouper"
3030
pluginshub "github.com/NVIDIA/KAI-scheduler/pkg/podgrouper/podgrouper/hub"
@@ -51,7 +51,7 @@ func init() {
5151

5252
type App struct {
5353
Mgr manager.Manager
54-
DefaultPluginsHub pluginshub.PluginsHub
54+
DefaultPluginsHub *pluginshub.DefaultPluginsHub
5555

5656
configs controllers.Configs
5757
pluginsHub pluginshub.PluginsHub

deployments/kai-scheduler/crds/kai.scheduler_configs.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3256,8 +3256,6 @@ spec:
32563256
description: PodLabelSelector filters pods for webhooks and pod
32573257
grouper
32583258
type: object
3259-
prometheusEnabled:
3260-
type: boolean
32613259
queueLabelKey:
32623260
description: QueueLabelKey specifies the pod label key whose value
32633261
will be the queue name of the pod.

deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ spec:
5555
* Only valid flags defined in the scheduler's flag set will be accepted
5656
* Duplicated flags will override the behavior of flags generated by other fields
5757
type: object
58+
kValue:
59+
description: KValue specifies the kValue for the proportion plugin.
60+
Default is 1.0.
61+
type: number
5862
minRuntime:
5963
description: MinRuntime specifies the minimum runtime of a jobs in
6064
the shard
@@ -87,6 +91,54 @@ spec:
8791
description: QueueDepthPerAction max number of jobs to try for action
8892
per queue
8993
type: object
94+
usageDBConfig:
95+
description: UsageDBConfig defines configuration for the usage db
96+
client
97+
properties:
98+
clientType:
99+
type: string
100+
connectionString:
101+
type: string
102+
connectionStringEnvVar:
103+
type: string
104+
usageParams:
105+
description: UsageParams defines common params for all usage db
106+
clients. Some clients may not support all the params.
107+
properties:
108+
extraParams:
109+
additionalProperties:
110+
type: string
111+
description: ExtraParams are extra parameters for the usage
112+
db client, which are client specific.
113+
type: object
114+
fetchInterval:
115+
description: Fetch interval of the usage. Default is 1 minute.
116+
type: string
117+
halfLifePeriod:
118+
description: Half life period of the usage. If not set, or
119+
set to 0, the usage will not be decayed.
120+
type: string
121+
stalenessPeriod:
122+
description: Staleness period of the usage. Default is 5 minutes.
123+
type: string
124+
tumblingWindowCronString:
125+
description: A cron string used to determine when to reset
126+
resource usage for all queues.
127+
type: string
128+
waitTimeout:
129+
description: Wait timeout of the usage. Default is 1 minute.
130+
type: string
131+
windowSize:
132+
description: Window size of the usage. Default is 1 week.
133+
type: string
134+
windowType:
135+
description: Window type for time-series aggregation. If not
136+
set, defaults to sliding.
137+
type: string
138+
type: object
139+
required:
140+
- clientType
141+
type: object
90142
type: object
91143
status:
92144
description: SchedulingShardStatus defines the observed state of SchedulingShard

deployments/kai-scheduler/templates/rbac/prometheus-binding.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ metadata:
77
name: kai-prometheus
88
subjects:
99
- kind: ServiceAccount
10-
name: kai-prometheus
10+
name: prometheus
1111
namespace: kai-scheduler
1212
roleRef:
1313
kind: ClusterRole

pkg/apis/kai/v1/global.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,6 @@ type GlobalConfig struct {
6868
// +kubebuilder:validation:Optional
6969
PodLabelSelector map[string]string `json:"podLabelSelector,omitempty"`
7070

71-
// +kubebuilder:validation:Optional
72-
PrometheusEnabled *bool `json:"prometheusEnabled,omitempty"`
73-
7471
// Connection defines the connection configuration for TSDB
7572
// +kubebuilder:validation:Optional
7673
ExternalTSDBConnection *Connection `json:"connection,omitempty"`

pkg/apis/kai/v1/schedulingshard_types.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"k8s.io/utils/ptr"
2222

2323
"github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/common"
24+
usagedbapi "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/cache/usagedb/api"
2425
)
2526

2627
const (
@@ -58,6 +59,14 @@ type SchedulingShardSpec struct {
5859
// MinRuntime specifies the minimum runtime of jobs in the shard
5960
// +kubebuilder:validation:Optional
6061
MinRuntime *MinRuntime `json:"minRuntime,omitempty"`
62+
63+
// KValue specifies the kValue for the proportion plugin. Default is 1.0.
64+
// +kubebuilder:validation:Optional
65+
KValue *float64 `json:"kValue,omitempty"`
66+
67+
// UsageDBConfig defines configuration for the usage db client
68+
// +kubebuilder:validation:Optional
69+
UsageDBConfig *usagedbapi.UsageDBConfig `yaml:"usageDBConfig,omitempty" json:"usageDBConfig,omitempty"`
6170
}
6271

6372
func (s *SchedulingShardSpec) SetDefaultsWhereNeeded() {

pkg/apis/kai/v1/zz_generated.deepcopy.go

Lines changed: 9 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/operator/operands/common/common.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,13 @@ var KaiServicesForServiceMonitor = []struct {
3131
Name string
3232
Port string
3333
JobLabel string
34+
35+
LabelSelector map[string]string // optional, if not provided, "app": NAME will be used
36+
Namespaces []string // optional, if not provided, the service will be monitored in the namespace of the KAI config
3437
}{
35-
{"queue-controller", "metrics", "queue-controller"},
38+
{"queue-controller", "metrics", "queue-controller", nil, nil},
39+
{"kube-state-metrics", "http", "kube-state-metrics",
40+
map[string]string{"app.kubernetes.io/name": "kube-state-metrics"}, []string{"monitoring", "default"}},
3641
}
3742

3843
func AllControllersAvailable(

pkg/operator/operands/prometheus/resources.go

Lines changed: 17 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ import (
2424
)
2525

2626
const (
27-
mainResourceName = "prometheus"
27+
mainResourceName = "prometheus"
28+
serviceMonitorAccountingLabel = "accounting"
29+
serviceMonitorAccountingValue = "kai"
2830
)
2931

3032
func prometheusForKAIConfig(
@@ -96,7 +98,7 @@ func prometheusForKAIConfig(
9698
if config.ServiceMonitor != nil && *config.ServiceMonitor.Enabled {
9799
prometheusSpec.ServiceMonitorSelector = &metav1.LabelSelector{
98100
MatchLabels: map[string]string{
99-
"accounting": mainResourceName,
101+
serviceMonitorAccountingLabel: serviceMonitorAccountingValue,
100102
},
101103
}
102104
prometheusSpec.ServiceMonitorNamespaceSelector = &metav1.LabelSelector{}
@@ -136,18 +138,26 @@ func serviceMonitorsForKAIConfig(
136138
return nil, err
137139
}
138140

139-
serviceMonitorObj.GetLabels()["accounting"] = mainResourceName
141+
serviceMonitorObj.GetLabels()[serviceMonitorAccountingLabel] = serviceMonitorAccountingValue
142+
143+
namespaces := []string{kaiConfig.Spec.Namespace}
144+
if kaiService.Namespaces != nil {
145+
namespaces = kaiService.Namespaces
146+
}
147+
148+
labelSelector := map[string]string{"app": kaiService.Name}
149+
if kaiService.LabelSelector != nil {
150+
labelSelector = kaiService.LabelSelector
151+
}
140152

141153
// Set the ServiceMonitor spec from configuration
142154
serviceMonitorSpec := monitoringv1.ServiceMonitorSpec{
143155
JobLabel: kaiService.JobLabel,
144156
NamespaceSelector: monitoringv1.NamespaceSelector{
145-
MatchNames: []string{kaiConfig.Spec.Namespace},
157+
MatchNames: namespaces,
146158
},
147159
Selector: metav1.LabelSelector{
148-
MatchLabels: map[string]string{
149-
"app": kaiService.Name,
150-
},
160+
MatchLabels: labelSelector,
151161
},
152162
Endpoints: []monitoringv1.Endpoint{
153163
{
@@ -172,35 +182,6 @@ func serviceMonitorsForKAIConfig(
172182
serviceMonitors = append(serviceMonitors, serviceMonitorObj)
173183
}
174184

175-
kubeStateMetric := &monitoringv1.ServiceMonitor{
176-
ObjectMeta: metav1.ObjectMeta{
177-
Name: "kube-state-metrics",
178-
Namespace: kaiConfig.Spec.Namespace,
179-
Labels: map[string]string{
180-
"accounting": mainResourceName,
181-
},
182-
},
183-
Spec: monitoringv1.ServiceMonitorSpec{
184-
JobLabel: "kube-state-metrics",
185-
NamespaceSelector: monitoringv1.NamespaceSelector{
186-
MatchNames: []string{"monitoring", "default"},
187-
},
188-
Selector: metav1.LabelSelector{
189-
MatchLabels: map[string]string{
190-
"app.kubernetes.io/name": "kube-state-metrics",
191-
},
192-
},
193-
Endpoints: []monitoringv1.Endpoint{
194-
{
195-
Port: "http",
196-
BearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
197-
Interval: "30s",
198-
},
199-
},
200-
},
201-
}
202-
203-
serviceMonitors = append(serviceMonitors, kubeStateMetric)
204185
return serviceMonitors, nil
205186
}
206187

0 commit comments

Comments
 (0)