Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
bin/
.idea/
charts/
cover.out
cover.out
.DS_Store
5 changes: 4 additions & 1 deletion build/makefile/testenv.mk
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
ENVTEST_K8S_VERSION = 1.32.0
ENVTEST_VERSION=release-0.20

E2E_TESTS_DIR = "test/e2e/"
TEST_TARGETS = $(shell go list ./... | grep -v "${E2E_TESTS_DIR}")

envtest-docker-go: gocache
@ ${ECHO_COMMAND} ${GREEN_CONSOLE} "${CONSOLE_PREFIX} Running unit-tests" ${BASE_CONSOLE}
${DOCKER_GO_COMMAND} make envtest-go || ${FAILURE_MESSAGE_HANDLER}
Expand All @@ -10,7 +13,7 @@ envtest-docker-go: gocache
envtest-go: envtest
@ ${ECHO_COMMAND} ${GREEN_CONSOLE} "${CONSOLE_PREFIX} Running unit-tests" ${BASE_CONSOLE}
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path --bin-dir $(LOCALBIN))" \
go test ./... -timeout 30m || ${FAILURE_MESSAGE_HANDLER}
go test ${TEST_TARGETS} -timeout 30m || ${FAILURE_MESSAGE_HANDLER}
${SUCCESS_MESSAGE_HANDLER}

ENVTEST = $(LOCALBIN)/setup-envtest
Expand Down
12 changes: 9 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,20 @@ module github.com/NVIDIA/KAI-scheduler
go 1.23.4

require (
github.com/NVIDIA/gpu-operator v1.8.3-0.20240812232433-87286e93f2c9
github.com/argoproj/argo-workflows/v3 v3.6.4
github.com/dustin/go-humanize v1.0.1
github.com/gin-contrib/pprof v1.5.2
github.com/gin-gonic/gin v1.10.0
github.com/go-logr/logr v1.4.2
github.com/golang/glog v1.2.4
github.com/grafana/pyroscope-go v1.2.1
github.com/onsi/ginkgo v1.16.5
github.com/onsi/ginkgo/v2 v2.22.2
github.com/onsi/gomega v1.36.2
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.20.5
github.com/run-ai/kwok-operator v0.0.0-20240926063032-05b6364bc7c7
github.com/spf13/pflag v1.0.6
github.com/stretchr/testify v1.10.0
github.com/xyproto/randomstring v1.2.0
Expand Down Expand Up @@ -51,6 +54,7 @@ require (
k8s.io/pod-security-admission v0.32.1
k8s.io/sample-apiserver v0.32.1
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738
knative.dev/pkg v0.0.0-20250117084104-c43477f0052b
knative.dev/serving v0.44.0
sigs.k8s.io/controller-runtime v0.20.0
sigs.k8s.io/karpenter v1.2.0
Expand All @@ -59,9 +63,11 @@ require (
require (
cel.dev/expr v0.18.0 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/NVIDIA/k8s-kata-manager v0.2.0 // indirect
github.com/NVIDIA/k8s-operator-libs v0.0.0-20240627150410-078e3039ecf7 // indirect
github.com/NYTimes/gziphandler v1.1.1 // indirect
github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
github.com/awslabs/operatorpkg v0.0.0-20241205163410-0fff9f28d115 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
Expand All @@ -83,7 +89,6 @@ require (
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.7 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
Expand Down Expand Up @@ -132,6 +137,7 @@ require (
github.com/opencontainers/selinux v1.11.1 // indirect
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.73.2 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
Expand All @@ -157,6 +163,7 @@ require (
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
golang.org/x/arch v0.12.0 // indirect
golang.org/x/crypto v0.35.0 // indirect
golang.org/x/mod v0.22.0 // indirect
golang.org/x/net v0.36.0 // indirect
golang.org/x/oauth2 v0.25.0 // indirect
golang.org/x/sync v0.11.0 // indirect
Expand All @@ -183,7 +190,6 @@ require (
k8s.io/kube-scheduler v0.32.1 // indirect
k8s.io/kubelet v0.32.1 // indirect
knative.dev/networking v0.0.0-20250117155906-67d1c274ba6a // indirect
knative.dev/pkg v0.0.0-20250117084104-c43477f0052b // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect
Expand Down
14 changes: 12 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ contrib.go.opencensus.io/exporter/prometheus v0.4.2/go.mod h1:dvEHbiKmgvbr5pjaF9
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
github.com/NVIDIA/gpu-operator v1.8.3-0.20240812232433-87286e93f2c9 h1:9yw3Jkto9ZqtNwlnOzxAlKufMNplaWPRfkXhCUFHXgI=
github.com/NVIDIA/gpu-operator v1.8.3-0.20240812232433-87286e93f2c9/go.mod h1:lOgoRYbt1dtCVGX+EhxuZgPonfcIs41BXrnIPk3fE3I=
github.com/NVIDIA/k8s-kata-manager v0.2.0 h1:K+BFkXTOvXXj/kmbNfxFCXM+GkdOVZj2WTHQ7b2uQA0=
github.com/NVIDIA/k8s-kata-manager v0.2.0/go.mod h1:fVUz0DLzwW9RQBE59cLNTi3LrzVwSXWIogr9y5FocPM=
github.com/NVIDIA/k8s-operator-libs v0.0.0-20240627150410-078e3039ecf7 h1:NaClubDuTKoXy4Ev4ZkmpVy3u6xdwd3I1XFJdoI1r+M=
github.com/NVIDIA/k8s-operator-libs v0.0.0-20240627150410-078e3039ecf7/go.mod h1:d8YV6Am03Z9VS4fh6virN/ltOSasCZtAhkwMRU1X6Vs=
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I=
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
github.com/Pallinder/go-randomdata v1.2.0 h1:DZ41wBchNRb/0GfsePLiSwb0PHZmT67XY00lCDlaYPg=
Expand All @@ -20,8 +26,8 @@ github.com/argoproj/argo-workflows/v3 v3.6.4 h1:5+Cc1UwaQE5ka3w7R3hxZ1TK3M6VjDEX
github.com/argoproj/argo-workflows/v3 v3.6.4/go.mod h1:2f5zB8CkbNCCO1od+kd1dWkVokqcuyvu+tc+Jwx1MZg=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA=
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/awslabs/operatorpkg v0.0.0-20241205163410-0fff9f28d115 h1:9nhjY3dzCpEmhpQ0vMlhB7wqucAiftLjAIEQu8uT2J4=
Expand Down Expand Up @@ -258,6 +264,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.73.2 h1:GwlGJPK6vf1UIohpc72KJVkKYlzki1UgE3xC4bWbf20=
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.73.2/go.mod h1:yJ3CawR/A5qEYFEeCOUVYLTwYxmacfHQhJS+b/2QiaM=
github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
Expand All @@ -274,6 +282,8 @@ github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzG
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/run-ai/kwok-operator v0.0.0-20240926063032-05b6364bc7c7 h1:7sUlviMShcd8g6sf2Q93ix+geV0staOMk42Rs0rAJqA=
github.com/run-ai/kwok-operator v0.0.0-20240926063032-05b6364bc7c7/go.mod h1:vih5aAo7hS8Mt6NXsIUI8SYW+WfB4GhZOuLZzLjZWcc=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc=
github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU=
Expand Down
16 changes: 16 additions & 0 deletions hack/e2e-kind-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker
labels:
run.ai/simulated-gpu-node-pool: default
- role: worker
labels:
run.ai/simulated-gpu-node-pool: default
- role: worker
labels:
run.ai/simulated-gpu-node-pool: default
42 changes: 42 additions & 0 deletions hack/run-e2e-kind.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

# Runs the e2e test suites against a throwaway kind cluster.
#
# Usage: run-e2e-kind.sh <path-to-secret-file>
#   <path-to-secret-file>  Manifest applied with `kubectl apply` in the
#                          kai-scheduler namespace; it must contain the
#                          credentials for the nvstaging-runai helm and docker
#                          image repository.
#
# Environment:
#   CLUSTER_NAME  Name of the kind cluster to create (default: e2e-kai-scheduler).
#
# Exit status: the status of the ginkgo run, reported after the cluster has
# been deleted, so a failing suite is visible to the caller.

CLUSTER_NAME=${CLUSTER_NAME:-e2e-kai-scheduler}

REPO_ROOT=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/..
KIND_CONFIG=${REPO_ROOT}/hack/e2e-kind-config.yaml
GOPATH=${HOME}/go
GOBIN=${GOPATH}/bin

NVCR_SECRET_FILE_PATH=${1}
if [ -z "$NVCR_SECRET_FILE_PATH" ]; then
    echo "Must provide a path to an appropriate secret file, that contains the credentials for the nvstaging-runai helm and docker image repository"
    exit 1
fi

kind create cluster --config "${KIND_CONFIG}" --name "${CLUSTER_NAME}"

kubectl create namespace kai-scheduler
# Set an appropriate secret to allow the kai-scheduler system pods to pull from
# nvstaging-runai and to pull the test images for the e2e tests
kubectl apply -f "${NVCR_SECRET_FILE_PATH}" -n kai-scheduler


# Install the fake-gpu-operator to provide fake GPU resources for the e2e tests
helm upgrade -i gpu-operator fake-gpu-operator/fake-gpu-operator --namespace gpu-operator --create-namespace --version 0.0.53 --set topology.nodePools.default.gpuCount=8

helm upgrade -i kai-scheduler nvstaging-runai/kai-scheduler -n kai-scheduler --create-namespace --set "global.imagePullSecrets[0].name=nvcr-secret" --set "global.gpuSharing=true" --set "global.registry=nvcr.io/nvstaging/runai" --version v0.2.0

# Allow all the pods in the fake-gpu-operator and kai-scheduler to start
sleep 30

# Install ginkgo if it's not installed.
# The CLI is pinned to the ginkgo/v2 version required in go.mod.
if [ ! -f "${GOBIN}/ginkgo" ]; then
    echo "Installing ginkgo"
    GOBIN=${GOBIN} go install github.com/onsi/ginkgo/v2/ginkgo@v2.22.2
fi

# Flags must come before the package path, and the label filter is a single
# expression: skip specs labelled "autoscale" as well as specs labelled "scale".
"${GOBIN}/ginkgo" -r --keep-going --randomize-all --randomize-suites --trace -vv \
    --label-filter '!autoscale && !scale' \
    "${REPO_ROOT}/test/e2e/suites"
TEST_EXIT_CODE=$?

# Always tear the cluster down, then report the test result rather than the
# result of the cleanup.
kind delete cluster --name "${CLUSTER_NAME}"

exit "${TEST_EXIT_CODE}"
5 changes: 5 additions & 0 deletions pkg/common/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,9 @@ const (
MigEnabledLabel = "node-role.kubernetes.io/runai-mig-enabled"
MigStrategyLabel = "nvidia.com/mig.strategy"
GpuCountLabel = "nvidia.com/gpu.count"
QueueLabelKey = "runai/queue"

// Namespaces
SystemPodsNamespace = "kai-scheduler"
RunaiReservationNamespace = "runai-reservation"
)
46 changes: 46 additions & 0 deletions pkg/common/gpu_operator_discovery/cdi.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright 2025 NVIDIA CORPORATION
// SPDX-License-Identifier: Apache-2.0

package gpu_operator_discovery

import (
"context"
"fmt"

nvidiav1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"sigs.k8s.io/controller-runtime/pkg/client"
logf "sigs.k8s.io/controller-runtime/pkg/log"
)

// IsCdiEnabled reports whether the NVIDIA ClusterPolicy found in the cluster
// has CDI both enabled and marked as the default.
//
// ClusterPolicy objects are listed through readerClient. A no-match or
// not-found error from the list call yields (false, nil); any other list error
// is logged and returned. An empty list yields (false, nil). When more than
// one ClusterPolicy exists, an informational message is logged and only the
// first item is consulted.
func IsCdiEnabled(ctx context.Context, readerClient client.Reader) (bool, error) {
	policyList := &nvidiav1.ClusterPolicyList{}
	if err := readerClient.List(ctx, policyList); err != nil {
		if meta.IsNoMatchError(err) || kerrors.IsNotFound(err) {
			return false, nil
		}
		logf.FromContext(ctx).Error(err, "cannot list nvidia cluster policy")
		return false, err
	}

	policyCount := len(policyList.Items)
	switch {
	case policyCount == 0:
		return false, nil
	case policyCount > 1:
		logf.FromContext(ctx).Info(fmt.Sprintf("Cluster has %d clusterpolicies.nvidia.com/v1 objects."+
			" First one is queried for the cdi configuration", policyCount))
	}

	// Both flags are optional pointers; an unset flag counts as false.
	cdi := policyList.Items[0].Spec.CDI
	enabled := cdi.Enabled != nil && *cdi.Enabled
	isDefault := cdi.Default != nil && *cdi.Default
	return enabled && isDefault, nil
}
17 changes: 17 additions & 0 deletions test/e2e/modules/constant/constant.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
Copyright 2025 NVIDIA CORPORATION
SPDX-License-Identifier: Apache-2.0
*/

package constant

// Shared constants for the e2e test modules.
//
// NOTE(review): RunaiReservationNamespace, SystemPodsNamespace and
// QueueLabelKey carry the same values as the identically named constants in
// pkg/common/constants; keep the two sets in sync.
//
// NOTE(review): NonPreemptiblePriorityThreshold presumably marks the priority
// boundary between preemptible and non-preemptible workloads — confirm
// against the code that consumes it.
const (
	NvidiaGPUMemoryLabelName        = "nvidia.com/gpu.memory"
	NodeNamePodLabelName            = "kubernetes.io/hostname"
	RunaiSchedulerName              = "kai-scheduler"
	RunaiReservationNamespace       = "runai-reservation"
	SystemPodsNamespace             = "kai-scheduler"
	NonPreemptiblePriorityThreshold = 100
	EngineTestPodsApp               = "engine-e2e"
	QueueLabelKey                   = "runai/queue"
)
9 changes: 9 additions & 0 deletions test/e2e/modules/constant/labels/labels.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/*
Copyright 2025 NVIDIA CORPORATION
SPDX-License-Identifier: Apache-2.0
*/
package labels

// Label names shared by the e2e tests.
//
// NOTE(review): whether ReservationPod is used as a test-spec label or as a
// Kubernetes object label is not visible here — confirm at the call sites.
const (
	ReservationPod = "reservationPod"
)
51 changes: 51 additions & 0 deletions test/e2e/modules/context/cluster.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
Copyright 2025 NVIDIA CORPORATION
SPDX-License-Identifier: Apache-2.0
*/
package context

import (
"context"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

v2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2"
"github.com/NVIDIA/KAI-scheduler/test/e2e/modules/resources/rd"
"github.com/NVIDIA/KAI-scheduler/test/e2e/modules/resources/rd/queue"
)

const (
defaultServiceAccountName = "default"
)

// createClusterQueues calls createQueueContext for every queue held in
// tc.Queues, in iteration order. It stops at the first failure and returns
// that error unchanged; nil means every call succeeded.
func (tc *TestContext) createClusterQueues(ctx context.Context) error {
	for _, q := range tc.Queues {
		if err := createQueueContext(ctx, q); err != nil {
			return err
		}
	}
	return nil
}

// createQueueContext creates the queue q through the package-level scheduler
// clientset and then creates the namespace that
// queue.GetConnectedNamespaceToQueue reports for q.
//
// The first error encountered is returned as-is. If the namespace creation
// fails, the already created queue is left in place.
func createQueueContext(ctx context.Context, q *v2.Queue) error {
	if _, err := queue.Create(kubeAiSchedClientset, ctx, q, metav1.CreateOptions{}); err != nil {
		return err
	}

	connectedNamespace := queue.GetConnectedNamespaceToQueue(q)
	namespaceObject := rd.CreateNamespaceObject(connectedNamespace, q.Name)
	namespaces := kubeClientset.CoreV1().Namespaces()

	// TODO: add RBAC role bindings
	// TODO: patch the namespace to add appropriate secret to the service account

	_, err := namespaces.Create(ctx, namespaceObject, metav1.CreateOptions{})
	return err
}
Loading