Skip to content

Commit de52e86

Browse files
committed
chore: address review coments
Signed-off-by: Ajay Mishra <[email protected]>
1 parent 45f77cb commit de52e86

File tree

9 files changed

+192
-51
lines changed

9 files changed

+192
-51
lines changed

health-monitors/kubernetes-object-monitor/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ require (
66
github.com/google/cel-go v0.26.1
77
github.com/nvidia/nvsentinel/commons v0.0.0
88
github.com/nvidia/nvsentinel/data-models v0.0.0
9-
github.com/pelletier/go-toml/v2 v2.2.4
109
github.com/prometheus/client_golang v1.23.2
1110
github.com/prometheus/client_model v0.6.2
1211
github.com/stretchr/testify v1.11.1
@@ -19,6 +18,7 @@ require (
1918

2019
require (
2120
cel.dev/expr v0.25.1 // indirect
21+
github.com/BurntSushi/toml v1.5.0 // indirect
2222
github.com/antlr4-go/antlr/v4 v4.13.1 // indirect
2323
github.com/beorn7/perks v1.0.1 // indirect
2424
github.com/cespare/xxhash/v2 v2.3.0 // indirect

health-monitors/kubernetes-object-monitor/go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4=
22
cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4=
3+
github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg=
4+
github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
35
github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ=
46
github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw=
57
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -102,8 +104,6 @@ github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg
102104
github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo=
103105
github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw=
104106
github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog=
105-
github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
106-
github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY=
107107
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
108108
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
109109
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=

health-monitors/kubernetes-object-monitor/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ var (
6060
)
6161
maxConcurrentReconciles = flag.Int(
6262
"max-concurrent-reconciles",
63-
10,
63+
1,
6464
"Maximum number of resources to reconcile concurrently",
6565
)
6666
platformConnectorSocket = flag.String(

health-monitors/kubernetes-object-monitor/pkg/cel/environment.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,11 @@ func (e *Environment) lookup(args ...ref.Val) ref.Val {
141141
}
142142

143143
if err := e.client.Get(ctx, key, obj); err != nil {
144-
slog.Error("Failed to get object from informer cache", "error", err)
144+
slog.Error("Failed to get object using cached client", "error", err)
145145
return types.NullValue
146146
}
147147

148-
slog.Info("Successfully got object from informer cache", "object", obj.Object)
148+
slog.Info("Successfully got object using cached client", "object", obj.Object)
149149

150150
return types.DefaultTypeAdapter.NativeToValue(obj.Object)
151151
}

health-monitors/kubernetes-object-monitor/pkg/config/loader.go

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,14 @@ package config
1515

1616
import (
1717
"fmt"
18-
"os"
1918

20-
"github.com/pelletier/go-toml/v2"
19+
"github.com/nvidia/nvsentinel/commons/pkg/configmanager"
2120
)
2221

2322
func Load(path string) (*Config, error) {
24-
data, err := os.ReadFile(path)
25-
if err != nil {
26-
return nil, fmt.Errorf("failed to read config file: %w", err)
27-
}
28-
2923
var cfg Config
30-
if err := toml.Unmarshal(data, &cfg); err != nil {
31-
return nil, fmt.Errorf("failed to parse TOML config: %w", err)
24+
if err := configmanager.LoadTOMLConfig(path, &cfg); err != nil {
25+
return nil, err
3226
}
3327

3428
if err := validate(&cfg); err != nil {

health-monitors/kubernetes-object-monitor/pkg/config/types.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,11 @@ type AssociationSpec struct {
4141
}
4242

4343
type HealthEventSpec struct {
44-
ComponentClass string `toml:"componentClass"`
45-
IsFatal bool `toml:"isFatal"`
46-
Message string `toml:"message"`
47-
RecommendedAction string `toml:"recommendedAction"`
44+
ComponentClass string `toml:"componentClass"`
45+
IsFatal bool `toml:"isFatal"`
46+
Message string `toml:"message"`
47+
RecommendedAction string `toml:"recommendedAction"`
48+
ErrorCode []string `toml:"errorCode"`
4849
}
4950

5051
func (r *ResourceSpec) GVK() string {

health-monitors/kubernetes-object-monitor/pkg/controller/reconciler.go

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ func (r *ResourceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
6464
obj.SetGroupVersionKind(r.gvk)
6565

6666
if err := r.Get(ctx, req.NamespacedName, obj); err != nil {
67-
return r.handleGetError(err, req)
67+
return r.handleGetError(ctx, err, req)
6868
}
6969

7070
for _, p := range r.policies {
@@ -81,21 +81,18 @@ func (r *ResourceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
8181
return ctrl.Result{}, nil
8282
}
8383

84-
func (r *ResourceReconciler) handleGetError(err error, req ctrl.Request) (ctrl.Result, error) {
84+
func (r *ResourceReconciler) handleGetError(ctx context.Context, err error, req ctrl.Request) (ctrl.Result, error) {
8585
if client.IgnoreNotFound(err) == nil {
86-
r.cleanupDeletedResource(req)
86+
r.cleanupDeletedResource(ctx, req)
8787
return ctrl.Result{}, nil
8888
}
8989

90-
metrics.ReconciliationErrors.WithLabelValues(r.gvk.Kind, "get_error").Inc()
90+
metrics.ReconciliationErrors.WithLabelValues(r.gvk.Kind, "get_resource_error").Inc()
9191

92-
return ctrl.Result{}, err
92+
return ctrl.Result{}, fmt.Errorf("failed to get resource: %w", err)
9393
}
9494

95-
func (r *ResourceReconciler) cleanupDeletedResource(req ctrl.Request) {
96-
r.matchStatesMu.Lock()
97-
defer r.matchStatesMu.Unlock()
98-
95+
func (r *ResourceReconciler) cleanupDeletedResource(ctx context.Context, req ctrl.Request) {
9996
for _, p := range r.policies {
10097
if !p.Enabled {
10198
continue
@@ -107,8 +104,25 @@ func (r *ResourceReconciler) cleanupDeletedResource(req ctrl.Request) {
107104
obj.SetName(req.Name)
108105

109106
stateKey := r.getStateKey(&p, obj)
110-
if r.matchStates[stateKey] {
107+
108+
r.matchStatesMu.RLock()
109+
wasMatched := r.matchStates[stateKey]
110+
r.matchStatesMu.RUnlock()
111+
112+
if wasMatched {
113+
nodeName := obj.GetName()
114+
115+
if err := r.publisher.PublishHealthEvent(ctx, &p, nodeName, true); err != nil {
116+
slog.Error("Failed to publish healthy event for deleted resource",
117+
"policy", p.Name,
118+
"resource", req.NamespacedName,
119+
"error", err)
120+
metrics.HealthEventsPublishErrors.WithLabelValues(p.Name, "grpc_error").Inc()
121+
}
122+
123+
r.matchStatesMu.Lock()
111124
delete(r.matchStates, stateKey)
125+
r.matchStatesMu.Unlock()
112126
}
113127
}
114128
}

health-monitors/kubernetes-object-monitor/pkg/controller/reconciler_test.go

Lines changed: 153 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package controller_test
1616

1717
import (
1818
"context"
19+
"strings"
1920
"testing"
2021
"time"
2122

@@ -217,6 +218,103 @@ func TestReconciler_DisabledPolicy(t *testing.T) {
217218
}, 500*time.Millisecond, 50*time.Millisecond)
218219
}
219220

221+
func TestReconciler_ErrorCodePropagation(t *testing.T) {
222+
setup := setupTest(t)
223+
nodeName := "test-node-error-code"
224+
225+
createNode(t, setup, nodeName, v1.ConditionFalse)
226+
227+
result, err := setup.reconciler.Reconcile(setup.ctx, ctrl.Request{
228+
NamespacedName: types.NamespacedName{Name: nodeName},
229+
})
230+
assert.NoError(t, err)
231+
assert.Equal(t, ctrl.Result{}, result)
232+
233+
require.Eventually(t, func() bool {
234+
if len(setup.publisher.publishedEvents) != 1 {
235+
return false
236+
}
237+
event := setup.publisher.publishedEvents[0]
238+
return len(event.policy.HealthEvent.ErrorCode) == 2 &&
239+
event.policy.HealthEvent.ErrorCode[0] == "NODE_NOT_READY" &&
240+
event.policy.HealthEvent.ErrorCode[1] == "CONDITION_FAILED"
241+
}, time.Second, 50*time.Millisecond)
242+
}
243+
244+
func TestReconciler_ColdStart(t *testing.T) {
245+
tests := []struct {
246+
name string
247+
postRestartAction func(*testing.T, *testSetup, string, *v1.Node)
248+
expectEvent bool
249+
expectHealthy bool
250+
}{
251+
{
252+
name: "unhealthy resource",
253+
postRestartAction: func(t *testing.T, s *testSetup, nodeName string, _ *v1.Node) {
254+
updateNodeStatus(t, s, nodeName, v1.ConditionFalse)
255+
},
256+
expectEvent: true,
257+
expectHealthy: false,
258+
},
259+
{
260+
name: "healthy resource",
261+
postRestartAction: func(t *testing.T, s *testSetup, nodeName string, _ *v1.Node) {
262+
updateNodeStatus(t, s, nodeName, v1.ConditionTrue)
263+
},
264+
expectEvent: false,
265+
},
266+
{
267+
name: "deleted resource",
268+
postRestartAction: func(t *testing.T, s *testSetup, _ string, node *v1.Node) {
269+
require.NoError(t, s.k8sClient.Delete(s.ctx, node))
270+
},
271+
expectEvent: false,
272+
},
273+
}
274+
275+
for _, tt := range tests {
276+
t.Run(tt.name, func(t *testing.T) {
277+
setup := setupTest(t)
278+
nodeName := "cold-start-" + strings.ReplaceAll(tt.name, " ", "-")
279+
280+
node := createNode(t, setup, nodeName, v1.ConditionFalse)
281+
282+
result, err := setup.reconciler.Reconcile(setup.ctx, ctrl.Request{
283+
NamespacedName: types.NamespacedName{Name: nodeName},
284+
})
285+
assert.NoError(t, err)
286+
assert.Equal(t, ctrl.Result{}, result)
287+
288+
require.Eventually(t, func() bool {
289+
return len(setup.publisher.publishedEvents) == 1
290+
}, time.Second, 50*time.Millisecond)
291+
292+
coldStartSetup := restartReconciler(t, setup)
293+
294+
tt.postRestartAction(t, coldStartSetup, nodeName, node)
295+
296+
result, err = coldStartSetup.reconciler.Reconcile(coldStartSetup.ctx, ctrl.Request{
297+
NamespacedName: types.NamespacedName{Name: nodeName},
298+
})
299+
assert.NoError(t, err)
300+
assert.Equal(t, ctrl.Result{}, result)
301+
302+
if tt.expectEvent {
303+
require.Eventually(t, func() bool {
304+
if len(coldStartSetup.publisher.publishedEvents) != 1 {
305+
return false
306+
}
307+
return coldStartSetup.publisher.publishedEvents[0].isHealthy == tt.expectHealthy
308+
}, time.Second, 50*time.Millisecond)
309+
} else {
310+
require.Never(t, func() bool {
311+
return len(coldStartSetup.publisher.publishedEvents) > 0
312+
}, 500*time.Millisecond, 50*time.Millisecond)
313+
}
314+
})
315+
}
316+
}
317+
220318
type testSetup struct {
221319
ctx context.Context
222320
k8sClient client.Client
@@ -226,31 +324,31 @@ type testSetup struct {
226324
testEnv *envtest.Environment
227325
}
228326

229-
func setupTest(t *testing.T) *testSetup {
230-
t.Helper()
231-
232-
policies := []config.Policy{
233-
{
234-
Name: "node-not-ready",
235-
Enabled: true,
236-
Resource: config.ResourceSpec{
237-
Group: "",
238-
Version: "v1",
239-
Kind: "Node",
240-
},
241-
Predicate: config.PredicateSpec{
242-
Expression: `resource.status.conditions.filter(c, c.type == "Ready" && c.status == "False").size() > 0`,
243-
},
244-
HealthEvent: config.HealthEventSpec{
245-
ComponentClass: "Node",
246-
IsFatal: true,
247-
Message: "Node is not ready",
248-
RecommendedAction: "CONTACT_SUPPORT",
249-
},
327+
func defaultNodeNotReadyPolicy() config.Policy {
328+
return config.Policy{
329+
Name: "node-not-ready",
330+
Enabled: true,
331+
Resource: config.ResourceSpec{
332+
Group: "",
333+
Version: "v1",
334+
Kind: "Node",
335+
},
336+
Predicate: config.PredicateSpec{
337+
Expression: `resource.status.conditions.filter(c, c.type == "Ready" && c.status == "False").size() > 0`,
338+
},
339+
HealthEvent: config.HealthEventSpec{
340+
ComponentClass: "Node",
341+
IsFatal: true,
342+
Message: "Node is not ready",
343+
RecommendedAction: "CONTACT_SUPPORT",
344+
ErrorCode: []string{"NODE_NOT_READY", "CONDITION_FAILED"},
250345
},
251346
}
347+
}
252348

253-
return setupTestWithPolicies(t, policies)
349+
func setupTest(t *testing.T) *testSetup {
350+
t.Helper()
351+
return setupTestWithPolicies(t, []config.Policy{defaultNodeNotReadyPolicy()})
254352
}
255353

256354
func setupTestWithPolicies(t *testing.T, policies []config.Policy) *testSetup {
@@ -375,6 +473,39 @@ func updateNodeStatus(t *testing.T, setup *testSetup, name string, readyStatus v
375473
}, time.Second, 50*time.Millisecond)
376474
}
377475

476+
func restartReconciler(t *testing.T, setup *testSetup) *testSetup {
477+
t.Helper()
478+
479+
gvk := schema.GroupVersionKind{
480+
Group: "",
481+
Version: "v1",
482+
Kind: "Node",
483+
}
484+
485+
mockPub := &mockPublisher{
486+
publishedEvents: []mockPublishedEvent{},
487+
}
488+
489+
policies := []config.Policy{defaultNodeNotReadyPolicy()}
490+
491+
reconciler := controller.NewResourceReconciler(
492+
setup.k8sClient,
493+
setup.evaluator,
494+
mockPub,
495+
policies,
496+
gvk,
497+
)
498+
499+
return &testSetup{
500+
ctx: setup.ctx,
501+
k8sClient: setup.k8sClient,
502+
reconciler: reconciler,
503+
publisher: mockPub,
504+
evaluator: setup.evaluator,
505+
testEnv: setup.testEnv,
506+
}
507+
}
508+
378509
func getCounterVecValue(t *testing.T, counterVec *prometheus.CounterVec, labelValues ...string) float64 {
379510
t.Helper()
380511
counter, err := counterVec.GetMetricWithLabelValues(labelValues...)

health-monitors/kubernetes-object-monitor/pkg/publisher/publisher.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ func (p *Publisher) PublishHealthEvent(ctx context.Context,
5555
IsHealthy: isHealthy,
5656
NodeName: nodeName,
5757
RecommendedAction: mapRecommendedAction(policy.HealthEvent.RecommendedAction),
58+
ErrorCode: policy.HealthEvent.ErrorCode,
5859
}
5960

6061
healthEvents := &pb.HealthEvents{

0 commit comments

Comments
 (0)