@@ -255,30 +255,7 @@ func TestReconciler_ErrorCodePropagation(t *testing.T) {
255255}
256256
257257func TestReconciler_CustomResource (t * testing.T ) {
258- crPolicy := config.Policy {
259- Name : "gpu-job-failed" ,
260- Enabled : true ,
261- Resource : config.ResourceSpec {
262- Group : "batch.nvidia.com" ,
263- Version : "v1alpha1" ,
264- Kind : "GPUJob" ,
265- },
266- Predicate : config.PredicateSpec {
267- Expression : `has(resource.status.state) && resource.status.state == "Failed"` ,
268- },
269- NodeAssociation : & config.AssociationSpec {
270- Expression : `resource.spec.nodeName` ,
271- },
272- HealthEvent : config.HealthEventSpec {
273- ComponentClass : "GPU" ,
274- IsFatal : false ,
275- Message : "GPU job failed" ,
276- RecommendedAction : "CONTACT_SUPPORT" ,
277- ErrorCode : []string {"GPU_JOB_FAILED" },
278- },
279- }
280-
281- setup := setupTestWithCRD (t , []config.Policy {crPolicy }, gpuJobCRD ())
258+ setup := setupTestWithCRD (t , []config.Policy {defaultGPUJobFailedPolicy ()}, gpuJobCRD ())
282259 nodeName := "gpu-test-node"
283260 jobName := "test-gpu-job"
284261 namespace := "default"
@@ -351,6 +328,81 @@ func TestReconciler_CustomResource(t *testing.T) {
351328 }, time .Second , 50 * time .Millisecond )
352329}
353330
331+ func TestReconciler_CustomResourceColdStart (t * testing.T ) {
332+ crPolicy := defaultGPUJobFailedPolicy ()
333+ setup := setupTestWithCRD (t , []config.Policy {crPolicy }, gpuJobCRD ())
334+ nodeName := "gpu-test-node-cold"
335+ jobName := "test-gpu-job-cold"
336+ namespace := "default"
337+
338+ createNode (t , setup , nodeName , v1 .ConditionTrue )
339+
340+ gpuJob := & unstructured.Unstructured {
341+ Object : map [string ]any {
342+ "apiVersion" : "batch.nvidia.com/v1alpha1" ,
343+ "kind" : "GPUJob" ,
344+ "metadata" : map [string ]any {
345+ "name" : jobName ,
346+ "namespace" : namespace ,
347+ },
348+ "spec" : map [string ]any {
349+ "nodeName" : nodeName ,
350+ },
351+ },
352+ }
353+
354+ require .NoError (t , setup .k8sClient .Create (setup .ctx , gpuJob ))
355+
356+ require .Eventually (t , func () bool {
357+ err := setup .k8sClient .Get (setup .ctx , types.NamespacedName {
358+ Name : jobName ,
359+ Namespace : namespace ,
360+ }, gpuJob )
361+ return err == nil
362+ }, time .Second , 50 * time .Millisecond )
363+
364+ gpuJob .Object ["status" ] = map [string ]any {
365+ "state" : "Failed" ,
366+ }
367+ require .NoError (t , setup .k8sClient .Status ().Update (setup .ctx , gpuJob ))
368+
369+ result , err := setup .reconciler .Reconcile (setup .ctx , ctrl.Request {
370+ NamespacedName : types.NamespacedName {Name : jobName , Namespace : namespace },
371+ })
372+ assert .NoError (t , err )
373+ assert .Equal (t , ctrl.Result {}, result )
374+
375+ require .Eventually (t , func () bool {
376+ if len (setup .publisher .publishedEvents ) != 1 {
377+ return false
378+ }
379+ event := setup .publisher .publishedEvents [0 ]
380+ return event .nodeName == nodeName &&
381+ ! event .isHealthy &&
382+ event .policy .Name == "gpu-job-failed"
383+ }, time .Second , 50 * time .Millisecond )
384+
385+ coldStartSetup := restartReconcilerWithCRD (t , setup , []config.Policy {crPolicy }, gpuJobCRD ())
386+
387+ require .NoError (t , coldStartSetup .k8sClient .Delete (coldStartSetup .ctx , gpuJob ))
388+
389+ result , err = coldStartSetup .reconciler .Reconcile (coldStartSetup .ctx , ctrl.Request {
390+ NamespacedName : types.NamespacedName {Name : jobName , Namespace : namespace },
391+ })
392+ assert .NoError (t , err )
393+ assert .Equal (t , ctrl.Result {}, result )
394+
395+ require .Eventually (t , func () bool {
396+ if len (coldStartSetup .publisher .publishedEvents ) != 1 {
397+ return false
398+ }
399+ event := coldStartSetup .publisher .publishedEvents [0 ]
400+ return event .nodeName == nodeName &&
401+ event .isHealthy &&
402+ event .policy .Name == "gpu-job-failed"
403+ }, time .Second , 50 * time .Millisecond )
404+ }
405+
354406func TestReconciler_ColdStart (t * testing.T ) {
355407 tests := []struct {
356408 name string
@@ -457,6 +509,31 @@ func defaultNodeNotReadyPolicy() config.Policy {
457509 }
458510}
459511
512+ func defaultGPUJobFailedPolicy () config.Policy {
513+ return config.Policy {
514+ Name : "gpu-job-failed" ,
515+ Enabled : true ,
516+ Resource : config.ResourceSpec {
517+ Group : "batch.nvidia.com" ,
518+ Version : "v1alpha1" ,
519+ Kind : "GPUJob" ,
520+ },
521+ Predicate : config.PredicateSpec {
522+ Expression : `has(resource.status.state) && resource.status.state == "Failed"` ,
523+ },
524+ NodeAssociation : & config.AssociationSpec {
525+ Expression : `resource.spec.nodeName` ,
526+ },
527+ HealthEvent : config.HealthEventSpec {
528+ ComponentClass : "GPU" ,
529+ IsFatal : false ,
530+ Message : "GPU job failed" ,
531+ RecommendedAction : "CONTACT_SUPPORT" ,
532+ ErrorCode : []string {"GPU_JOB_FAILED" },
533+ },
534+ }
535+ }
536+
460537func setupTest (t * testing.T ) * testSetup {
461538 t .Helper ()
462539 return setupTestWithPolicies (t , []config.Policy {defaultNodeNotReadyPolicy ()})
@@ -682,6 +759,44 @@ func restartReconciler(t *testing.T, setup *testSetup) *testSetup {
682759 }
683760}
684761
762+ func restartReconcilerWithCRD (t * testing.T , setup * testSetup , policies []config.Policy , crd * apiextensionsv1.CustomResourceDefinition ) * testSetup {
763+ t .Helper ()
764+
765+ gvk := schema.GroupVersionKind {
766+ Group : crd .Spec .Group ,
767+ Version : crd .Spec .Versions [0 ].Name ,
768+ Kind : crd .Spec .Names .Kind ,
769+ }
770+
771+ mockPub := & mockPublisher {
772+ publishedEvents : []mockPublishedEvent {},
773+ }
774+
775+ annotationMgr := annotations .NewManager (setup .k8sClient )
776+
777+ reconciler := controller .NewResourceReconciler (
778+ setup .k8sClient ,
779+ setup .evaluator ,
780+ mockPub ,
781+ annotationMgr ,
782+ policies ,
783+ gvk ,
784+ )
785+
786+ if err := reconciler .LoadState (setup .ctx ); err != nil {
787+ t .Fatalf ("Failed to load state after restart: %v" , err )
788+ }
789+
790+ return & testSetup {
791+ ctx : setup .ctx ,
792+ k8sClient : setup .k8sClient ,
793+ reconciler : reconciler ,
794+ publisher : mockPub ,
795+ evaluator : setup .evaluator ,
796+ testEnv : setup .testEnv ,
797+ }
798+ }
799+
685800func getCounterVecValue (t * testing.T , counterVec * prometheus.CounterVec , labelValues ... string ) float64 {
686801 t .Helper ()
687802 counter , err := counterVec .GetMetricWithLabelValues (labelValues ... )
0 commit comments