Skip to content

Commit 7d64c6a

Browse files
committed
chore: add another test
Signed-off-by: Ajay Mishra <[email protected]>
1 parent 3a117fd commit 7d64c6a

File tree

1 file changed

+139
-24
lines changed

1 file changed

+139
-24
lines changed

health-monitors/kubernetes-object-monitor/pkg/controller/reconciler_test.go

Lines changed: 139 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -255,30 +255,7 @@ func TestReconciler_ErrorCodePropagation(t *testing.T) {
255255
}
256256

257257
func TestReconciler_CustomResource(t *testing.T) {
258-
crPolicy := config.Policy{
259-
Name: "gpu-job-failed",
260-
Enabled: true,
261-
Resource: config.ResourceSpec{
262-
Group: "batch.nvidia.com",
263-
Version: "v1alpha1",
264-
Kind: "GPUJob",
265-
},
266-
Predicate: config.PredicateSpec{
267-
Expression: `has(resource.status.state) && resource.status.state == "Failed"`,
268-
},
269-
NodeAssociation: &config.AssociationSpec{
270-
Expression: `resource.spec.nodeName`,
271-
},
272-
HealthEvent: config.HealthEventSpec{
273-
ComponentClass: "GPU",
274-
IsFatal: false,
275-
Message: "GPU job failed",
276-
RecommendedAction: "CONTACT_SUPPORT",
277-
ErrorCode: []string{"GPU_JOB_FAILED"},
278-
},
279-
}
280-
281-
setup := setupTestWithCRD(t, []config.Policy{crPolicy}, gpuJobCRD())
258+
setup := setupTestWithCRD(t, []config.Policy{defaultGPUJobFailedPolicy()}, gpuJobCRD())
282259
nodeName := "gpu-test-node"
283260
jobName := "test-gpu-job"
284261
namespace := "default"
@@ -351,6 +328,81 @@ func TestReconciler_CustomResource(t *testing.T) {
351328
}, time.Second, 50*time.Millisecond)
352329
}
353330

331+
func TestReconciler_CustomResourceColdStart(t *testing.T) {
332+
crPolicy := defaultGPUJobFailedPolicy()
333+
setup := setupTestWithCRD(t, []config.Policy{crPolicy}, gpuJobCRD())
334+
nodeName := "gpu-test-node-cold"
335+
jobName := "test-gpu-job-cold"
336+
namespace := "default"
337+
338+
createNode(t, setup, nodeName, v1.ConditionTrue)
339+
340+
gpuJob := &unstructured.Unstructured{
341+
Object: map[string]any{
342+
"apiVersion": "batch.nvidia.com/v1alpha1",
343+
"kind": "GPUJob",
344+
"metadata": map[string]any{
345+
"name": jobName,
346+
"namespace": namespace,
347+
},
348+
"spec": map[string]any{
349+
"nodeName": nodeName,
350+
},
351+
},
352+
}
353+
354+
require.NoError(t, setup.k8sClient.Create(setup.ctx, gpuJob))
355+
356+
require.Eventually(t, func() bool {
357+
err := setup.k8sClient.Get(setup.ctx, types.NamespacedName{
358+
Name: jobName,
359+
Namespace: namespace,
360+
}, gpuJob)
361+
return err == nil
362+
}, time.Second, 50*time.Millisecond)
363+
364+
gpuJob.Object["status"] = map[string]any{
365+
"state": "Failed",
366+
}
367+
require.NoError(t, setup.k8sClient.Status().Update(setup.ctx, gpuJob))
368+
369+
result, err := setup.reconciler.Reconcile(setup.ctx, ctrl.Request{
370+
NamespacedName: types.NamespacedName{Name: jobName, Namespace: namespace},
371+
})
372+
assert.NoError(t, err)
373+
assert.Equal(t, ctrl.Result{}, result)
374+
375+
require.Eventually(t, func() bool {
376+
if len(setup.publisher.publishedEvents) != 1 {
377+
return false
378+
}
379+
event := setup.publisher.publishedEvents[0]
380+
return event.nodeName == nodeName &&
381+
!event.isHealthy &&
382+
event.policy.Name == "gpu-job-failed"
383+
}, time.Second, 50*time.Millisecond)
384+
385+
coldStartSetup := restartReconcilerWithCRD(t, setup, []config.Policy{crPolicy}, gpuJobCRD())
386+
387+
require.NoError(t, coldStartSetup.k8sClient.Delete(coldStartSetup.ctx, gpuJob))
388+
389+
result, err = coldStartSetup.reconciler.Reconcile(coldStartSetup.ctx, ctrl.Request{
390+
NamespacedName: types.NamespacedName{Name: jobName, Namespace: namespace},
391+
})
392+
assert.NoError(t, err)
393+
assert.Equal(t, ctrl.Result{}, result)
394+
395+
require.Eventually(t, func() bool {
396+
if len(coldStartSetup.publisher.publishedEvents) != 1 {
397+
return false
398+
}
399+
event := coldStartSetup.publisher.publishedEvents[0]
400+
return event.nodeName == nodeName &&
401+
event.isHealthy &&
402+
event.policy.Name == "gpu-job-failed"
403+
}, time.Second, 50*time.Millisecond)
404+
}
405+
354406
func TestReconciler_ColdStart(t *testing.T) {
355407
tests := []struct {
356408
name string
@@ -457,6 +509,31 @@ func defaultNodeNotReadyPolicy() config.Policy {
457509
}
458510
}
459511

512+
func defaultGPUJobFailedPolicy() config.Policy {
513+
return config.Policy{
514+
Name: "gpu-job-failed",
515+
Enabled: true,
516+
Resource: config.ResourceSpec{
517+
Group: "batch.nvidia.com",
518+
Version: "v1alpha1",
519+
Kind: "GPUJob",
520+
},
521+
Predicate: config.PredicateSpec{
522+
Expression: `has(resource.status.state) && resource.status.state == "Failed"`,
523+
},
524+
NodeAssociation: &config.AssociationSpec{
525+
Expression: `resource.spec.nodeName`,
526+
},
527+
HealthEvent: config.HealthEventSpec{
528+
ComponentClass: "GPU",
529+
IsFatal: false,
530+
Message: "GPU job failed",
531+
RecommendedAction: "CONTACT_SUPPORT",
532+
ErrorCode: []string{"GPU_JOB_FAILED"},
533+
},
534+
}
535+
}
536+
460537
func setupTest(t *testing.T) *testSetup {
461538
t.Helper()
462539
return setupTestWithPolicies(t, []config.Policy{defaultNodeNotReadyPolicy()})
@@ -682,6 +759,44 @@ func restartReconciler(t *testing.T, setup *testSetup) *testSetup {
682759
}
683760
}
684761

762+
func restartReconcilerWithCRD(t *testing.T, setup *testSetup, policies []config.Policy, crd *apiextensionsv1.CustomResourceDefinition) *testSetup {
763+
t.Helper()
764+
765+
gvk := schema.GroupVersionKind{
766+
Group: crd.Spec.Group,
767+
Version: crd.Spec.Versions[0].Name,
768+
Kind: crd.Spec.Names.Kind,
769+
}
770+
771+
mockPub := &mockPublisher{
772+
publishedEvents: []mockPublishedEvent{},
773+
}
774+
775+
annotationMgr := annotations.NewManager(setup.k8sClient)
776+
777+
reconciler := controller.NewResourceReconciler(
778+
setup.k8sClient,
779+
setup.evaluator,
780+
mockPub,
781+
annotationMgr,
782+
policies,
783+
gvk,
784+
)
785+
786+
if err := reconciler.LoadState(setup.ctx); err != nil {
787+
t.Fatalf("Failed to load state after restart: %v", err)
788+
}
789+
790+
return &testSetup{
791+
ctx: setup.ctx,
792+
k8sClient: setup.k8sClient,
793+
reconciler: reconciler,
794+
publisher: mockPub,
795+
evaluator: setup.evaluator,
796+
testEnv: setup.testEnv,
797+
}
798+
}
799+
685800
func getCounterVecValue(t *testing.T, counterVec *prometheus.CounterVec, labelValues ...string) float64 {
686801
t.Helper()
687802
counter, err := counterVec.GetMetricWithLabelValues(labelValues...)

0 commit comments

Comments
 (0)