Skip to content

Commit 1b65a13

Browse files
authored
Build topology trees for the scheduler plugin (#301)
1 parent 4ac7c2c commit 1b65a13

File tree

32 files changed

+1437
-219
lines changed

32 files changed

+1437
-219
lines changed

.github/workflows/on-pr.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ jobs:
2121
- name: Set up Go
2222
uses: actions/setup-go@v5
2323
with:
24-
go-version: '1.23.4'
24+
go-version: '1.24.4'
2525

2626
- name: Run validation
2727
run: make validate
@@ -85,7 +85,7 @@ jobs:
8585
- name: Set up Go
8686
uses: actions/setup-go@v5
8787
with:
88-
go-version: '1.23.4'
88+
go-version: '1.24.4'
8989

9090
- name: Set up Docker Buildx
9191
uses: docker/setup-buildx-action@v3

.github/workflows/on-release.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ jobs:
2727
- name: Set up Go
2828
uses: actions/setup-go@v5
2929
with:
30-
go-version: '1.23.4'
30+
go-version: '1.24.4'
3131

3232
- name: Set up Docker Buildx
3333
uses: docker/setup-buildx-action@v3

.github/workflows/push-artifacts.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ jobs:
4141
- name: Set up Go
4242
uses: actions/setup-go@v5
4343
with:
44-
go-version: '1.23.4'
44+
go-version: '1.24.4'
4545

4646
- name: Log in to GitHub Container Registry
4747
uses: docker/login-action@v3

.github/workflows/update-coverage-badge.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ jobs:
2222
- name: Set up Go
2323
uses: actions/setup-go@v5
2424
with:
25-
go-version: '1.23.4'
25+
go-version: '1.24.4'
2626

2727
- name: Run tests with coverage
2828
run: make test

build/builder/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# Copyright 2025 NVIDIA CORPORATION
22
# SPDX-License-Identifier: Apache-2.0
33

4-
FROM golang:1.23.4-bullseye AS builder
4+
FROM golang:1.24.4-bullseye AS builder
55

66
RUN apt-get update && apt-get install -y \
77
g++-x86-64-linux-gnu \

build/makefile/golang.mk

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,9 @@ GOPATH_HOST_DIR=${HOME}/.cache/go-build-docker-gopath
1515
DOCKER_GO_CACHING_VOLUME_AND_ENV := -v ${GOPATH_HOST_DIR}:/go:z -v ${GOCACHE_HOST_DIR}:${GOCACHE_DOCKER_DIR}:z -e GOPATH=/go -e GOCACHE=${GOCACHE_DOCKER_DIR} -e GOLANGCI_LINT_CACHE=${GOCACHE_DOCKER_DIR}
1616

1717
## Version
18-
GO_VERSION=1.23.4
18+
GO_VERSION=1.24.4
1919
GO_IMAGE_VERSION=${GO_VERSION}-bullseye
20-
GOLANGCI_LINT_VERSION=v1.60.1
20+
GOLANGCI_LINT_VERSION=v1.64.8
2121

2222
## Tool Versions
2323
CGO_ENABLED?=1

cmd/snapshot-tool/main.go

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ import (
2727
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/metrics"
2828
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins"
2929
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins/snapshot"
30+
kueuefake "sigs.k8s.io/kueue/client-go/clientset/versioned/fake"
3031
)
3132

3233
func main() {
@@ -59,11 +60,12 @@ func main() {
5960
actions.InitDefaultActions()
6061
plugins.InitDefaultPlugins()
6162

62-
kubeClient, kaiClient := loadClientsWithSnapshot(snapshot.RawObjects)
63+
kubeClient, kaiClient, kueueClient := loadClientsWithSnapshot(snapshot.RawObjects)
6364

6465
schedulerCacheParams := &cache.SchedulerCacheParams{
6566
KubeClient: kubeClient,
6667
KAISchedulerClient: kaiClient,
68+
KueueClient: kueueClient,
6769
SchedulerName: snapshot.SchedulerParams.SchedulerName,
6870
NodePoolParams: snapshot.SchedulerParams.PartitionParams,
6971
RestrictNodeScheduling: snapshot.SchedulerParams.RestrictSchedulingNodes,
@@ -126,9 +128,10 @@ func loadSnapshot(filename string) (*snapshot.Snapshot, error) {
126128
return nil, os.ErrNotExist
127129
}
128130

129-
func loadClientsWithSnapshot(rawObjects *snapshot.RawKubernetesObjects) (*fake.Clientset, *kaischedulerfake.Clientset) {
131+
func loadClientsWithSnapshot(rawObjects *snapshot.RawKubernetesObjects) (*fake.Clientset, *kaischedulerfake.Clientset, *kueuefake.Clientset) {
130132
kubeClient := fake.NewSimpleClientset()
131133
kaiClient := kaischedulerfake.NewSimpleClientset()
134+
kueueClient := kueuefake.NewSimpleClientset()
132135

133136
for _, pod := range rawObjects.Pods {
134137
_, err := kubeClient.CoreV1().Pods(pod.Namespace).Create(context.TODO(), pod, v1.CreateOptions{})
@@ -235,5 +238,5 @@ func loadClientsWithSnapshot(rawObjects *snapshot.RawKubernetesObjects) (*fake.C
235238
}
236239
}
237240

238-
return kubeClient, kaiClient
241+
return kubeClient, kaiClient, kueueClient
239242
}

deployments/kai-scheduler/templates/rbac/scheduler.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,14 @@ rules:
5353
- list
5454
- patch
5555
- update
56+
- apiGroups:
57+
- kueue.x-k8s.io
58+
resources:
59+
- topologies
60+
verbs:
61+
- get
62+
- list
63+
- watch
5664
- apiGroups:
5765
- policy
5866
resources:

deployments/kai-scheduler/templates/services/scheduler-configmap.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ data:
2727
cpu: {{ .Values.scheduler.placementStrategy }}
2828
gpu: {{ .Values.scheduler.placementStrategy }}
2929
- name: minruntime
30+
- name: topology
3031
kind: ConfigMap
3132
metadata:
3233
labels:

go.mod

Lines changed: 60 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
module github.com/NVIDIA/KAI-scheduler
22

3-
go 1.23.4
3+
go 1.24.4
44

55
require (
66
github.com/NVIDIA/go-nvml v0.12.4-1
@@ -13,12 +13,12 @@ require (
1313
github.com/golang/glog v1.2.4
1414
github.com/grafana/pyroscope-go v1.2.1
1515
github.com/kubeflow/mpi-operator v0.6.0
16-
github.com/kubeflow/training-operator v1.9.1
16+
github.com/kubeflow/training-operator v1.9.2
1717
github.com/onsi/ginkgo v1.16.5
18-
github.com/onsi/ginkgo/v2 v2.23.3
18+
github.com/onsi/ginkgo/v2 v2.23.4
1919
github.com/onsi/gomega v1.37.0
2020
github.com/pkg/errors v0.9.1
21-
github.com/prometheus/client_golang v1.20.5
21+
github.com/prometheus/client_golang v1.22.0
2222
github.com/ray-project/kuberay/ray-operator v1.3.1
2323
github.com/run-ai/kwok-operator v0.0.0-20240926063032-05b6364bc7c7
2424
github.com/spf13/pflag v1.0.6
@@ -28,22 +28,22 @@ require (
2828
go.uber.org/mock v0.5.0
2929
go.uber.org/multierr v1.11.0
3030
go.uber.org/zap v1.27.0
31-
golang.org/x/exp v0.0.0-20250207012021-f9890c6ad9f3
32-
gomodules.xyz/jsonpatch/v2 v2.4.0
31+
golang.org/x/exp v0.0.0-20250305212735-054e65f0b394
32+
gomodules.xyz/jsonpatch/v2 v2.5.0
3333
google.golang.org/grpc v1.69.2
3434
gopkg.in/h2non/gock.v1 v1.1.2
3535
gopkg.in/yaml.v2 v2.4.0
3636
gopkg.in/yaml.v3 v3.0.1
3737
gotest.tools v2.2.0+incompatible
38-
k8s.io/api v0.32.4
38+
k8s.io/api v0.32.5
3939
k8s.io/apiextensions-apiserver v0.32.4
40-
k8s.io/apimachinery v0.32.4
41-
k8s.io/apiserver v0.32.4
42-
k8s.io/cli-runtime v0.32.4
43-
k8s.io/client-go v0.32.4
40+
k8s.io/apimachinery v0.32.5
41+
k8s.io/apiserver v0.32.5
42+
k8s.io/cli-runtime v0.32.5
43+
k8s.io/client-go v0.32.5
4444
k8s.io/cluster-bootstrap v0.32.4
45-
k8s.io/component-base v0.32.4
46-
k8s.io/component-helpers v0.32.4
45+
k8s.io/component-base v0.32.5
46+
k8s.io/component-helpers v0.32.5
4747
k8s.io/cri-client v0.32.4
4848
k8s.io/dynamic-resource-allocation v0.32.4
4949
k8s.io/endpointslice v0.32.4
@@ -52,27 +52,28 @@ require (
5252
k8s.io/kube-aggregator v0.32.4
5353
k8s.io/kube-controller-manager v0.32.4
5454
k8s.io/kube-proxy v0.32.4
55-
k8s.io/kubectl v0.32.4
55+
k8s.io/kubectl v0.32.5
5656
k8s.io/kubernetes v1.32.6
57-
k8s.io/metrics v0.32.4
57+
k8s.io/metrics v0.32.5
5858
k8s.io/mount-utils v0.32.4
5959
k8s.io/pod-security-admission v0.32.4
6060
k8s.io/sample-apiserver v0.32.4
6161
k8s.io/utils v0.0.0-20241210054802-24370beab758
6262
knative.dev/pkg v0.0.0-20250117084104-c43477f0052b
6363
knative.dev/serving v0.44.0
64-
sigs.k8s.io/controller-runtime v0.20.0
64+
sigs.k8s.io/controller-runtime v0.20.4
6565
sigs.k8s.io/karpenter v1.2.0
66-
sigs.k8s.io/lws v0.5.1
66+
sigs.k8s.io/kueue v0.12.3
67+
sigs.k8s.io/lws v0.6.1
6768
)
6869

6970
require (
70-
cel.dev/expr v0.18.0 // indirect
71+
cel.dev/expr v0.19.1 // indirect
7172
github.com/Microsoft/go-winio v0.6.2 // indirect
7273
github.com/NVIDIA/k8s-kata-manager v0.2.0 // indirect
7374
github.com/NVIDIA/k8s-operator-libs v0.0.0-20240627150410-078e3039ecf7 // indirect
7475
github.com/NYTimes/gziphandler v1.1.1 // indirect
75-
github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
76+
github.com/antlr4-go/antlr/v4 v4.13.1 // indirect
7677
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
7778
github.com/awslabs/operatorpkg v0.0.0-20241205163410-0fff9f28d115 // indirect
7879
github.com/beorn7/perks v1.0.1 // indirect
@@ -88,18 +89,18 @@ require (
8889
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
8990
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
9091
github.com/distribution/reference v0.6.0 // indirect
91-
github.com/emicklei/go-restful/v3 v3.12.1 // indirect
92+
github.com/emicklei/go-restful/v3 v3.12.2 // indirect
9293
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
9394
github.com/felixge/httpsnoop v1.0.4 // indirect
94-
github.com/fsnotify/fsnotify v1.8.0 // indirect
95+
github.com/fsnotify/fsnotify v1.9.0 // indirect
9596
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
9697
github.com/gabriel-vasile/mimetype v1.4.7 // indirect
9798
github.com/gin-contrib/sse v0.1.0 // indirect
9899
github.com/go-logr/stdr v1.2.2 // indirect
99100
github.com/go-logr/zapr v1.3.0 // indirect
100-
github.com/go-openapi/jsonpointer v0.21.0 // indirect
101+
github.com/go-openapi/jsonpointer v0.21.1 // indirect
101102
github.com/go-openapi/jsonreference v0.21.0 // indirect
102-
github.com/go-openapi/swag v0.23.0 // indirect
103+
github.com/go-openapi/swag v0.23.1 // indirect
103104
github.com/go-playground/locales v0.14.1 // indirect
104105
github.com/go-playground/universal-translator v0.18.1 // indirect
105106
github.com/go-playground/validator/v10 v10.23.0 // indirect
@@ -108,23 +109,23 @@ require (
108109
github.com/gogo/protobuf v1.3.2 // indirect
109110
github.com/golang/protobuf v1.5.4 // indirect
110111
github.com/google/btree v1.1.3 // indirect
111-
github.com/google/cel-go v0.22.0 // indirect
112+
github.com/google/cel-go v0.22.1 // indirect
112113
github.com/google/gnostic-models v0.6.9 // indirect
113114
github.com/google/go-cmp v0.7.0 // indirect
114115
github.com/google/go-containerregistry v0.17.0 // indirect
115116
github.com/google/gofuzz v1.2.0 // indirect
116-
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect
117+
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
117118
github.com/google/uuid v1.6.0 // indirect
118119
github.com/gorilla/websocket v1.5.3 // indirect
119120
github.com/grafana/pyroscope-go/godeltaprof v0.1.8 // indirect
120121
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
121122
github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect
122-
github.com/grpc-ecosystem/grpc-gateway/v2 v2.21.0 // indirect
123+
github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 // indirect
123124
github.com/h2non/parth v0.0.0-20190131123155-b4df798d6542 // indirect
124125
github.com/inconshreveable/mousetrap v1.1.0 // indirect
125126
github.com/josharian/intern v1.0.0 // indirect
126127
github.com/json-iterator/go v1.1.12 // indirect
127-
github.com/klauspost/compress v1.17.11 // indirect
128+
github.com/klauspost/compress v1.18.0 // indirect
128129
github.com/klauspost/cpuid/v2 v2.2.9 // indirect
129130
github.com/kylelemons/godebug v1.1.0 // indirect
130131
github.com/leodido/go-urn v1.4.0 // indirect
@@ -144,45 +145,47 @@ require (
144145
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
145146
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
146147
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.73.2 // indirect
147-
github.com/prometheus/client_model v0.6.1 // indirect
148-
github.com/prometheus/common v0.62.0 // indirect
149-
github.com/prometheus/procfs v0.15.1 // indirect
148+
github.com/prometheus/client_model v0.6.2 // indirect
149+
github.com/prometheus/common v0.63.0 // indirect
150+
github.com/prometheus/procfs v0.16.0 // indirect
150151
github.com/robfig/cron/v3 v3.0.1 // indirect
151152
github.com/samber/lo v1.47.0 // indirect
152153
github.com/sirupsen/logrus v1.9.3 // indirect
153-
github.com/spf13/cobra v1.8.1 // indirect
154+
github.com/spf13/cobra v1.9.1 // indirect
154155
github.com/stoewer/go-strcase v1.3.0 // indirect
155156
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
156157
github.com/ugorji/go/codec v1.2.12 // indirect
157158
github.com/x448/float16 v0.8.4 // indirect
158-
go.etcd.io/etcd/api/v3 v3.5.16 // indirect
159-
go.etcd.io/etcd/client/pkg/v3 v3.5.16 // indirect
160-
go.etcd.io/etcd/client/v3 v3.5.16 // indirect
161-
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0 // indirect
162-
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 // indirect
163-
go.opentelemetry.io/otel v1.31.0 // indirect
164-
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect
165-
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 // indirect
166-
go.opentelemetry.io/otel/metric v1.31.0 // indirect
167-
go.opentelemetry.io/otel/sdk v1.31.0 // indirect
168-
go.opentelemetry.io/otel/trace v1.31.0 // indirect
169-
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
159+
go.etcd.io/etcd/api/v3 v3.5.17 // indirect
160+
go.etcd.io/etcd/client/pkg/v3 v3.5.17 // indirect
161+
go.etcd.io/etcd/client/v3 v3.5.17 // indirect
162+
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
163+
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.58.0 // indirect
164+
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect
165+
go.opentelemetry.io/otel v1.35.0 // indirect
166+
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect
167+
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect
168+
go.opentelemetry.io/otel/metric v1.35.0 // indirect
169+
go.opentelemetry.io/otel/sdk v1.33.0 // indirect
170+
go.opentelemetry.io/otel/trace v1.35.0 // indirect
171+
go.opentelemetry.io/proto/otlp v1.4.0 // indirect
172+
go.uber.org/automaxprocs v1.6.0 // indirect
170173
golang.org/x/arch v0.12.0 // indirect
171174
golang.org/x/crypto v0.36.0 // indirect
172-
golang.org/x/mod v0.23.0 // indirect
175+
golang.org/x/mod v0.24.0 // indirect
173176
golang.org/x/net v0.38.0 // indirect
174-
golang.org/x/oauth2 v0.26.0 // indirect
175-
golang.org/x/sync v0.12.0 // indirect
176-
golang.org/x/sys v0.31.0 // indirect
177+
golang.org/x/oauth2 v0.28.0 // indirect
178+
golang.org/x/sync v0.14.0 // indirect
179+
golang.org/x/sys v0.32.0 // indirect
177180
golang.org/x/term v0.30.0 // indirect
178181
golang.org/x/text v0.23.0 // indirect
179-
golang.org/x/time v0.10.0 // indirect
180-
golang.org/x/tools v0.30.0 // indirect
182+
golang.org/x/time v0.11.0 // indirect
183+
golang.org/x/tools v0.31.0 // indirect
181184
google.golang.org/api v0.215.0 // indirect
182185
google.golang.org/genproto v0.0.0-20241118233622-e639e219e697 // indirect
183-
google.golang.org/genproto/googleapis/api v0.0.0-20241209162323-e6fa225c2576 // indirect
186+
google.golang.org/genproto/googleapis/api v0.0.0-20241219192143-6b3ec007d9bb // indirect
184187
google.golang.org/genproto/googleapis/rpc v0.0.0-20241223144023-3abc09e42ca8 // indirect
185-
google.golang.org/protobuf v1.36.5 // indirect
188+
google.golang.org/protobuf v1.36.6 // indirect
186189
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
187190
gopkg.in/inf.v0 v0.9.1 // indirect
188191
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
@@ -191,14 +194,15 @@ require (
191194
k8s.io/controller-manager v0.32.4 // indirect
192195
k8s.io/cri-api v0.32.4 // indirect
193196
k8s.io/csi-translation-lib v0.32.4 // indirect
194-
k8s.io/kms v0.32.4 // indirect
195-
k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 // indirect
197+
k8s.io/kms v0.32.5 // indirect
198+
k8s.io/kube-openapi v0.0.0-20250304201544-e5f78fe3ede9 // indirect
196199
k8s.io/kube-scheduler v0.32.4 // indirect
197200
k8s.io/kubelet v0.32.4 // indirect
198201
knative.dev/networking v0.0.0-20250117155906-67d1c274ba6a // indirect
199-
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect
200-
sigs.k8s.io/jobset v0.5.2 // indirect
202+
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.1 // indirect
203+
sigs.k8s.io/jobset v0.8.1 // indirect
201204
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
202-
sigs.k8s.io/structured-merge-diff/v4 v4.5.0 // indirect
205+
sigs.k8s.io/randfill v1.0.0 // indirect
206+
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect
203207
sigs.k8s.io/yaml v1.4.0 // indirect
204208
)

0 commit comments

Comments
 (0)