Skip to content

Commit 8c6953b

Browse files
authored
Add The first e2e tests to the repo - better coverage will be added in the next PRS (#14)
1 parent cdadc3e commit 8c6953b

File tree

38 files changed

+2700
-7
lines changed

38 files changed

+2700
-7
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
bin/
22
.idea/
33
charts/
4-
cover.out
4+
cover.out
5+
.DS_Store

build/makefile/testenv.mk

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
ENVTEST_K8S_VERSION = 1.32.0
33
ENVTEST_VERSION=release-0.20
44

5+
E2E_TESTS_DIR = "test/e2e/"
6+
TEST_TARGETS = $(shell go list ./... | grep -v "${E2E_TESTS_DIR}")
7+
58
envtest-docker-go: gocache
69
@ ${ECHO_COMMAND} ${GREEN_CONSOLE} "${CONSOLE_PREFIX} Running unit-tests" ${BASE_CONSOLE}
710
${DOCKER_GO_COMMAND} make envtest-go || ${FAILURE_MESSAGE_HANDLER}
@@ -10,7 +13,7 @@ envtest-docker-go: gocache
1013
envtest-go: envtest
1114
@ ${ECHO_COMMAND} ${GREEN_CONSOLE} "${CONSOLE_PREFIX} Running unit-tests" ${BASE_CONSOLE}
1215
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path --bin-dir $(LOCALBIN))" \
13-
go test ./... -timeout 30m || ${FAILURE_MESSAGE_HANDLER}
16+
go test ${TEST_TARGETS} -timeout 30m || ${FAILURE_MESSAGE_HANDLER}
1417
${SUCCESS_MESSAGE_HANDLER}
1518

1619
ENVTEST = $(LOCALBIN)/setup-envtest

go.mod

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,20 @@ module github.com/NVIDIA/KAI-scheduler
33
go 1.23.4
44

55
require (
6+
github.com/NVIDIA/gpu-operator v1.8.3-0.20240812232433-87286e93f2c9
67
github.com/argoproj/argo-workflows/v3 v3.6.4
78
github.com/dustin/go-humanize v1.0.1
89
github.com/gin-contrib/pprof v1.5.2
910
github.com/gin-gonic/gin v1.10.0
11+
github.com/go-logr/logr v1.4.2
1012
github.com/golang/glog v1.2.4
1113
github.com/grafana/pyroscope-go v1.2.1
1214
github.com/onsi/ginkgo v1.16.5
1315
github.com/onsi/ginkgo/v2 v2.22.2
1416
github.com/onsi/gomega v1.36.2
1517
github.com/pkg/errors v0.9.1
1618
github.com/prometheus/client_golang v1.20.5
19+
github.com/run-ai/kwok-operator v0.0.0-20240926063032-05b6364bc7c7
1720
github.com/spf13/pflag v1.0.6
1821
github.com/stretchr/testify v1.10.0
1922
github.com/xyproto/randomstring v1.2.0
@@ -51,6 +54,7 @@ require (
5154
k8s.io/pod-security-admission v0.32.1
5255
k8s.io/sample-apiserver v0.32.1
5356
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738
57+
knative.dev/pkg v0.0.0-20250117084104-c43477f0052b
5458
knative.dev/serving v0.44.0
5559
sigs.k8s.io/controller-runtime v0.20.0
5660
sigs.k8s.io/karpenter v1.2.0
@@ -59,9 +63,11 @@ require (
5963
require (
6064
cel.dev/expr v0.18.0 // indirect
6165
github.com/Microsoft/go-winio v0.6.2 // indirect
66+
github.com/NVIDIA/k8s-kata-manager v0.2.0 // indirect
67+
github.com/NVIDIA/k8s-operator-libs v0.0.0-20240627150410-078e3039ecf7 // indirect
6268
github.com/NYTimes/gziphandler v1.1.1 // indirect
6369
github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
64-
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect
70+
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
6571
github.com/awslabs/operatorpkg v0.0.0-20241205163410-0fff9f28d115 // indirect
6672
github.com/beorn7/perks v1.0.1 // indirect
6773
github.com/blang/semver/v4 v4.0.0 // indirect
@@ -83,7 +89,6 @@ require (
8389
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
8490
github.com/gabriel-vasile/mimetype v1.4.7 // indirect
8591
github.com/gin-contrib/sse v0.1.0 // indirect
86-
github.com/go-logr/logr v1.4.2 // indirect
8792
github.com/go-logr/stdr v1.2.2 // indirect
8893
github.com/go-logr/zapr v1.3.0 // indirect
8994
github.com/go-openapi/jsonpointer v0.21.0 // indirect
@@ -132,6 +137,7 @@ require (
132137
github.com/opencontainers/selinux v1.11.1 // indirect
133138
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
134139
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
140+
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.73.2 // indirect
135141
github.com/prometheus/client_model v0.6.1 // indirect
136142
github.com/prometheus/common v0.55.0 // indirect
137143
github.com/prometheus/procfs v0.15.1 // indirect
@@ -157,6 +163,7 @@ require (
157163
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
158164
golang.org/x/arch v0.12.0 // indirect
159165
golang.org/x/crypto v0.35.0 // indirect
166+
golang.org/x/mod v0.22.0 // indirect
160167
golang.org/x/net v0.36.0 // indirect
161168
golang.org/x/oauth2 v0.25.0 // indirect
162169
golang.org/x/sync v0.11.0 // indirect
@@ -183,7 +190,6 @@ require (
183190
k8s.io/kube-scheduler v0.32.1 // indirect
184191
k8s.io/kubelet v0.32.1 // indirect
185192
knative.dev/networking v0.0.0-20250117155906-67d1c274ba6a // indirect
186-
knative.dev/pkg v0.0.0-20250117084104-c43477f0052b // indirect
187193
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect
188194
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
189195
sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect

go.sum

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ contrib.go.opencensus.io/exporter/prometheus v0.4.2/go.mod h1:dvEHbiKmgvbr5pjaF9
99
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
1010
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
1111
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
12+
github.com/NVIDIA/gpu-operator v1.8.3-0.20240812232433-87286e93f2c9 h1:9yw3Jkto9ZqtNwlnOzxAlKufMNplaWPRfkXhCUFHXgI=
13+
github.com/NVIDIA/gpu-operator v1.8.3-0.20240812232433-87286e93f2c9/go.mod h1:lOgoRYbt1dtCVGX+EhxuZgPonfcIs41BXrnIPk3fE3I=
14+
github.com/NVIDIA/k8s-kata-manager v0.2.0 h1:K+BFkXTOvXXj/kmbNfxFCXM+GkdOVZj2WTHQ7b2uQA0=
15+
github.com/NVIDIA/k8s-kata-manager v0.2.0/go.mod h1:fVUz0DLzwW9RQBE59cLNTi3LrzVwSXWIogr9y5FocPM=
16+
github.com/NVIDIA/k8s-operator-libs v0.0.0-20240627150410-078e3039ecf7 h1:NaClubDuTKoXy4Ev4ZkmpVy3u6xdwd3I1XFJdoI1r+M=
17+
github.com/NVIDIA/k8s-operator-libs v0.0.0-20240627150410-078e3039ecf7/go.mod h1:d8YV6Am03Z9VS4fh6virN/ltOSasCZtAhkwMRU1X6Vs=
1218
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I=
1319
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
1420
github.com/Pallinder/go-randomdata v1.2.0 h1:DZ41wBchNRb/0GfsePLiSwb0PHZmT67XY00lCDlaYPg=
@@ -20,8 +26,8 @@ github.com/argoproj/argo-workflows/v3 v3.6.4 h1:5+Cc1UwaQE5ka3w7R3hxZ1TK3M6VjDEX
2026
github.com/argoproj/argo-workflows/v3 v3.6.4/go.mod h1:2f5zB8CkbNCCO1od+kd1dWkVokqcuyvu+tc+Jwx1MZg=
2127
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
2228
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
23-
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA=
24-
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
29+
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
30+
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
2531
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
2632
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
2733
github.com/awslabs/operatorpkg v0.0.0-20241205163410-0fff9f28d115 h1:9nhjY3dzCpEmhpQ0vMlhB7wqucAiftLjAIEQu8uT2J4=
@@ -258,6 +264,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
258264
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
259265
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
260266
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
267+
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.73.2 h1:GwlGJPK6vf1UIohpc72KJVkKYlzki1UgE3xC4bWbf20=
268+
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.73.2/go.mod h1:yJ3CawR/A5qEYFEeCOUVYLTwYxmacfHQhJS+b/2QiaM=
261269
github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
262270
github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
263271
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
@@ -274,6 +282,8 @@ github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzG
274282
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
275283
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
276284
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
285+
github.com/run-ai/kwok-operator v0.0.0-20240926063032-05b6364bc7c7 h1:7sUlviMShcd8g6sf2Q93ix+geV0staOMk42Rs0rAJqA=
286+
github.com/run-ai/kwok-operator v0.0.0-20240926063032-05b6364bc7c7/go.mod h1:vih5aAo7hS8Mt6NXsIUI8SYW+WfB4GhZOuLZzLjZWcc=
277287
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
278288
github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc=
279289
github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU=

hack/e2e-kind-config.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Copyright 2025 NVIDIA CORPORATION
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
kind: Cluster
5+
apiVersion: kind.x-k8s.io/v1alpha4
6+
nodes:
7+
- role: control-plane
8+
- role: worker
9+
labels:
10+
run.ai/simulated-gpu-node-pool: default
11+
- role: worker
12+
labels:
13+
run.ai/simulated-gpu-node-pool: default
14+
- role: worker
15+
labels:
16+
run.ai/simulated-gpu-node-pool: default

hack/run-e2e-kind.sh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/bash
2+
# Copyright 2025 NVIDIA CORPORATION
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
6+
CLUSTER_NAME=${CLUSTER_NAME:-e2e-kai-scheduler}
7+
8+
REPO_ROOT=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/..
9+
KIND_CONFIG=${REPO_ROOT}/hack/e2e-kind-config.yaml
10+
GOPATH=${HOME}/go
11+
GOBIN=${GOPATH}/bin
12+
13+
NVCR_SECRET_FILE_PATH=${1}
14+
if [ -z "$NVCR_SECRET_FILE_PATH" ]; then
15+
echo "Must a path to an appropriate secret file, that contains the credentials for the nvstaging-runai helm and docker image repository"
16+
exit 1
17+
fi
18+
19+
kind create cluster --config ${KIND_CONFIG} --name $CLUSTER_NAME
20+
21+
kubectl create namespace kai-scheduler
22+
# Set an appropriate secret to allow the kube-ai system pods to pull from nvstaging-runai and pull test images for the e2e tests
23+
kubectl apply -f ${NVCR_SECRET_FILE_PATH} -n kai-scheduler
24+
25+
26+
# Install the fake-gpu-operator to provide fake GPU resources for the e2e tests
27+
helm upgrade -i gpu-operator fake-gpu-operator/fake-gpu-operator --namespace gpu-operator --create-namespace --version 0.0.53 --set topology.nodePools.default.gpuCount=8
28+
29+
helm upgrade -i kai-scheduler nvstaging-runai/kai-scheduler -n kai-scheduler --create-namespace --set "global.imagePullSecrets[0].name=nvcr-secret" --set "global.gpuSharing=true" --set "global.registry=nvcr.io/nvstaging/runai" --version v0.2.0
30+
31+
# Allow all the pods in the fake-gpu-operator and kai-scheduler to start
32+
sleep 30
33+
34+
# Install ginkgo if it's not installed
35+
if [ ! -f ${GOBIN}/ginkgo ]; then
36+
echo "Installing ginkgo"
37+
GOBIN=${GOBIN} go install github.com/onsi/ginkgo/v2/ginkgo@v2.22.2
38+
fi
39+
40+
${GOBIN}/ginkgo -r --keep-going --randomize-all --randomize-suites --trace -vv ${REPO_ROOT}/test/e2e/suites --label-filter '!autoscale', '!scale'
41+
42+
kind delete cluster --name $CLUSTER_NAME

pkg/common/constants/constants.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,9 @@ const (
3333
MigEnabledLabel = "node-role.kubernetes.io/runai-mig-enabled"
3434
MigStrategyLabel = "nvidia.com/mig.strategy"
3535
GpuCountLabel = "nvidia.com/gpu.count"
36+
QueueLabelKey = "runai/queue"
37+
38+
// Namespaces
39+
SystemPodsNamespace = "kai-scheduler"
40+
RunaiReservationNamespace = "runai-reservation"
3641
)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Copyright 2025 NVIDIA CORPORATION
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package gpu_operator_discovery
5+
6+
import (
7+
"context"
8+
"fmt"
9+
10+
nvidiav1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
11+
kerrors "k8s.io/apimachinery/pkg/api/errors"
12+
"k8s.io/apimachinery/pkg/api/meta"
13+
"sigs.k8s.io/controller-runtime/pkg/client"
14+
logf "sigs.k8s.io/controller-runtime/pkg/log"
15+
)
16+
17+
// IsCdiEnabled reports whether the cluster's NVIDIA ClusterPolicy has CDI both
// enabled and set as the default.
//
// It lists ClusterPolicy objects through readerClient. A "no match" or
// "not found" error from the list call is treated as "no policy present" and
// yields (false, nil); any other list error is logged and returned. When no
// ClusterPolicy exists the result is (false, nil). When more than one exists,
// an informational message is logged and only the first item is inspected.
// The result is true only if both Spec.CDI.Enabled and Spec.CDI.Default are
// non-nil and true.
func IsCdiEnabled(ctx context.Context, readerClient client.Reader) (bool, error) {
18+
nvidiaClusterPolicies := &nvidiav1.ClusterPolicyList{}
19+
err := readerClient.List(ctx, nvidiaClusterPolicies)
20+
if err != nil {
21+
if meta.IsNoMatchError(err) || kerrors.IsNotFound(err) {
22+
return false, nil
23+
}
24+
log := logf.FromContext(ctx)
25+
log.Error(err, "cannot list nvidia cluster policy")
26+
return false, err
27+
}
28+
29+
if len(nvidiaClusterPolicies.Items) == 0 {
30+
return false, nil
31+
}
32+
if len(nvidiaClusterPolicies.Items) > 1 {
33+
log := logf.FromContext(ctx)
34+
log.Info(fmt.Sprintf("Cluster has %d clusterpolicies.nvidia.com/v1 objects."+
35+
" First one is queried for the cdi configuration", len(nvidiaClusterPolicies.Items)))
36+
}
37+
38+
nvidiaClusterPolicy := nvidiaClusterPolicies.Items[0]
39+
if nvidiaClusterPolicy.Spec.CDI.Enabled != nil && *nvidiaClusterPolicy.Spec.CDI.Enabled {
40+
if nvidiaClusterPolicy.Spec.CDI.Default != nil && *nvidiaClusterPolicy.Spec.CDI.Default {
41+
return true, nil
42+
}
43+
}
44+
45+
return false, nil
46+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/*
2+
Copyright 2025 NVIDIA CORPORATION
3+
SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package constant
7+
8+
const (
9+
NvidiaGPUMemoryLabelName = "nvidia.com/gpu.memory"
10+
NodeNamePodLabelName = "kubernetes.io/hostname"
11+
RunaiSchedulerName = "kai-scheduler"
12+
RunaiReservationNamespace = "runai-reservation"
13+
SystemPodsNamespace = "kai-scheduler"
14+
NonPreemptiblePriorityThreshold = 100
15+
EngineTestPodsApp = "engine-e2e"
16+
QueueLabelKey = "runai/queue"
17+
)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
/*
2+
Copyright 2025 NVIDIA CORPORATION
3+
SPDX-License-Identifier: Apache-2.0
4+
*/
5+
package labels
6+
7+
const (
8+
ReservationPod = "reservationPod"
9+
)

0 commit comments

Comments
 (0)