@@ -30,7 +30,9 @@ import (
3030 "github.com/stretchr/testify/assert"
3131 "github.com/stretchr/testify/require"
3232 v1 "k8s.io/api/core/v1"
33+ apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
3334 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
35+ "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
3436 "k8s.io/apimachinery/pkg/runtime/schema"
3537 "k8s.io/apimachinery/pkg/types"
3638 ctrl "sigs.k8s.io/controller-runtime"
@@ -241,6 +243,83 @@ func TestReconciler_ErrorCodePropagation(t *testing.T) {
241243 }, time .Second , 50 * time .Millisecond )
242244}
243245
246+ func TestReconciler_CustomResource (t * testing.T ) {
247+ crPolicy := config.Policy {
248+ Name : "gpu-job-failed" ,
249+ Enabled : true ,
250+ Resource : config.ResourceSpec {
251+ Group : "batch.nvidia.com" ,
252+ Version : "v1alpha1" ,
253+ Kind : "GPUJob" ,
254+ },
255+ Predicate : config.PredicateSpec {
256+ Expression : `has(resource.status.state) && resource.status.state == "Failed"` ,
257+ },
258+ NodeAssociation : & config.AssociationSpec {
259+ Expression : `resource.spec.nodeName` ,
260+ },
261+ HealthEvent : config.HealthEventSpec {
262+ ComponentClass : "GPU" ,
263+ IsFatal : false ,
264+ Message : "GPU job failed" ,
265+ RecommendedAction : "CONTACT_SUPPORT" ,
266+ ErrorCode : []string {"GPU_JOB_FAILED" },
267+ },
268+ }
269+
270+ setup := setupTestWithCRD (t , []config.Policy {crPolicy }, gpuJobCRD ())
271+ nodeName := "gpu-test-node"
272+ jobName := "test-gpu-job"
273+ namespace := "default"
274+
275+ createNode (t , setup , nodeName , v1 .ConditionTrue )
276+
277+ gpuJob := & unstructured.Unstructured {
278+ Object : map [string ]any {
279+ "apiVersion" : "batch.nvidia.com/v1alpha1" ,
280+ "kind" : "GPUJob" ,
281+ "metadata" : map [string ]any {
282+ "name" : jobName ,
283+ "namespace" : namespace ,
284+ },
285+ "spec" : map [string ]any {
286+ "nodeName" : nodeName ,
287+ },
288+ },
289+ }
290+
291+ require .NoError (t , setup .k8sClient .Create (setup .ctx , gpuJob ))
292+
293+ require .Eventually (t , func () bool {
294+ err := setup .k8sClient .Get (setup .ctx , types.NamespacedName {
295+ Name : jobName ,
296+ Namespace : namespace ,
297+ }, gpuJob )
298+ return err == nil
299+ }, time .Second , 50 * time .Millisecond )
300+
301+ gpuJob .Object ["status" ] = map [string ]any {
302+ "state" : "Failed" ,
303+ }
304+ require .NoError (t , setup .k8sClient .Status ().Update (setup .ctx , gpuJob ))
305+
306+ result , err := setup .reconciler .Reconcile (setup .ctx , ctrl.Request {
307+ NamespacedName : types.NamespacedName {Name : jobName , Namespace : namespace },
308+ })
309+ assert .NoError (t , err )
310+ assert .Equal (t , ctrl.Result {}, result )
311+
312+ require .Eventually (t , func () bool {
313+ if len (setup .publisher .publishedEvents ) != 1 {
314+ return false
315+ }
316+ event := setup .publisher .publishedEvents [0 ]
317+ return event .nodeName == nodeName &&
318+ ! event .isHealthy &&
319+ event .policy .Name == "gpu-job-failed"
320+ }, time .Second , 50 * time .Millisecond )
321+ }
322+
244323func TestReconciler_ColdStart (t * testing.T ) {
245324 tests := []struct {
246325 name string
@@ -401,6 +480,58 @@ func setupTestWithPolicies(t *testing.T, policies []config.Policy) *testSetup {
401480 }
402481}
403482
483+ func setupTestWithCRD (t * testing.T , policies []config.Policy , crd * apiextensionsv1.CustomResourceDefinition ) * testSetup {
484+ t .Helper ()
485+
486+ ctx , cancel := context .WithTimeout (context .Background (), 30 * time .Second )
487+ t .Cleanup (cancel )
488+
489+ testEnv := & envtest.Environment {
490+ CRDs : []* apiextensionsv1.CustomResourceDefinition {crd },
491+ }
492+ cfg , err := testEnv .Start ()
493+ require .NoError (t , err )
494+ t .Cleanup (func () {
495+ assert .NoError (t , testEnv .Stop ())
496+ })
497+
498+ k8sClient , err := client .New (cfg , client.Options {})
499+ require .NoError (t , err )
500+
501+ mockPub := & mockPublisher {
502+ publishedEvents : []mockPublishedEvent {},
503+ }
504+
505+ celEnvironment , err := celenv .NewEnvironment (k8sClient )
506+ require .NoError (t , err )
507+
508+ evaluator , err := policy .NewEvaluator (celEnvironment , policies )
509+ require .NoError (t , err )
510+
511+ gvk := schema.GroupVersionKind {
512+ Group : crd .Spec .Group ,
513+ Version : crd .Spec .Versions [0 ].Name ,
514+ Kind : crd .Spec .Names .Kind ,
515+ }
516+
517+ reconciler := controller .NewResourceReconciler (
518+ k8sClient ,
519+ evaluator ,
520+ mockPub ,
521+ policies ,
522+ gvk ,
523+ )
524+
525+ return & testSetup {
526+ ctx : ctx ,
527+ k8sClient : k8sClient ,
528+ reconciler : reconciler ,
529+ publisher : mockPub ,
530+ evaluator : evaluator ,
531+ testEnv : testEnv ,
532+ }
533+ }
534+
404535type mockPublishedEvent struct {
405536 ctx context.Context
406537 policy * config.Policy
@@ -515,3 +646,50 @@ func getCounterVecValue(t *testing.T, counterVec *prometheus.CounterVec, labelVa
515646 require .NoError (t , err )
516647 return metric .Counter .GetValue ()
517648}
649+
650+ func gpuJobCRD () * apiextensionsv1.CustomResourceDefinition {
651+ return & apiextensionsv1.CustomResourceDefinition {
652+ ObjectMeta : metav1.ObjectMeta {
653+ Name : "gpujobs.batch.nvidia.com" ,
654+ },
655+ Spec : apiextensionsv1.CustomResourceDefinitionSpec {
656+ Group : "batch.nvidia.com" ,
657+ Names : apiextensionsv1.CustomResourceDefinitionNames {
658+ Plural : "gpujobs" ,
659+ Singular : "gpujob" ,
660+ Kind : "GPUJob" ,
661+ ListKind : "GPUJobList" ,
662+ },
663+ Scope : apiextensionsv1 .NamespaceScoped ,
664+ Versions : []apiextensionsv1.CustomResourceDefinitionVersion {
665+ {
666+ Name : "v1alpha1" ,
667+ Served : true ,
668+ Storage : true ,
669+ Schema : & apiextensionsv1.CustomResourceValidation {
670+ OpenAPIV3Schema : & apiextensionsv1.JSONSchemaProps {
671+ Type : "object" ,
672+ Properties : map [string ]apiextensionsv1.JSONSchemaProps {
673+ "spec" : {
674+ Type : "object" ,
675+ Properties : map [string ]apiextensionsv1.JSONSchemaProps {
676+ "nodeName" : {Type : "string" },
677+ },
678+ },
679+ "status" : {
680+ Type : "object" ,
681+ Properties : map [string ]apiextensionsv1.JSONSchemaProps {
682+ "state" : {Type : "string" },
683+ },
684+ },
685+ },
686+ },
687+ },
688+ Subresources : & apiextensionsv1.CustomResourceSubresources {
689+ Status : & apiextensionsv1.CustomResourceSubresourceStatus {},
690+ },
691+ },
692+ },
693+ },
694+ }
695+ }
0 commit comments