Skip to content

Commit b0a5185

Browse files
committed
fix broken things 2
Signed-off-by: Davanum Srinivas <[email protected]>
1 parent 528deac commit b0a5185

File tree

12 files changed

+369
-41
lines changed

12 files changed

+369
-41
lines changed

health-events-analyzer/pkg/reconciler/reconciler.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,16 @@ func (r *Reconciler) getPipelineStages(
284284
rule config.HealthEventsAnalyzerRule,
285285
healthEventWithStatus datamodels.HealthEventWithStatus,
286286
) ([]interface{}, error) {
287-
pipeline := make([]interface{}, 0, len(rule.Stage))
287+
// CRITICAL: Always start with agent filter to exclude events from health-events-analyzer itself
288+
// This prevents the analyzer from matching its own generated events, which would cause
289+
// infinite loops and incorrect rule evaluations
290+
pipeline := []interface{}{
291+
map[string]interface{}{
292+
"$match": map[string]interface{}{
293+
"healthevent.agent": map[string]interface{}{"$ne": "health-events-analyzer"},
294+
},
295+
},
296+
}
288297

289298
for i, stageStr := range rule.Stage {
290299
// Parse the stage and resolve "this." references
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package reconciler
16+
17+
import (
18+
"testing"
19+
20+
datamodels "github.com/nvidia/nvsentinel/data-models/pkg/model"
21+
protos "github.com/nvidia/nvsentinel/data-models/pkg/protos"
22+
config "github.com/nvidia/nvsentinel/health-events-analyzer/pkg/config"
23+
"github.com/stretchr/testify/assert"
24+
"github.com/stretchr/testify/require"
25+
)
26+
27+
// TestGetPipelineStages_AlwaysIncludesAgentFilter verifies that the agent filter
28+
// is ALWAYS the first stage in the pipeline to prevent the health-events-analyzer
29+
// from matching its own generated events, which would cause infinite loops and
30+
// incorrect rule evaluations.
31+
//
32+
// This is a regression test for the bug introduced in commit 7be1a34 where the
33+
// agent filter was accidentally removed during refactoring.
34+
func TestGetPipelineStages_AlwaysIncludesAgentFilter(t *testing.T) {
35+
reconciler := &Reconciler{
36+
config: HealthEventsAnalyzerReconcilerConfig{},
37+
}
38+
39+
testCases := []struct {
40+
name string
41+
rule config.HealthEventsAnalyzerRule
42+
event datamodels.HealthEventWithStatus
43+
description string
44+
}{
45+
{
46+
name: "rule with multiple stages",
47+
rule: config.HealthEventsAnalyzerRule{
48+
Name: "multi-stage-rule",
49+
Stage: []string{
50+
`{"$match": {"healthevent.nodename": "this.healthevent.nodename"}}`,
51+
`{"$count": "total"}`,
52+
`{"$match": {"total": {"$gte": 5}}}`,
53+
},
54+
},
55+
event: datamodels.HealthEventWithStatus{
56+
HealthEvent: &protos.HealthEvent{
57+
NodeName: "test-node",
58+
Agent: "gpu-health-monitor",
59+
},
60+
},
61+
description: "Multi-stage rule should have agent filter as first stage",
62+
},
63+
{
64+
name: "rule with single stage",
65+
rule: config.HealthEventsAnalyzerRule{
66+
Name: "single-stage-rule",
67+
Stage: []string{
68+
`{"$match": {"healthevent.isfatal": true}}`,
69+
},
70+
},
71+
event: datamodels.HealthEventWithStatus{
72+
HealthEvent: &protos.HealthEvent{
73+
NodeName: "test-node",
74+
Agent: "syslog-monitor",
75+
},
76+
},
77+
description: "Single-stage rule should have agent filter as first stage",
78+
},
79+
{
80+
name: "rule with empty stages",
81+
rule: config.HealthEventsAnalyzerRule{
82+
Name: "empty-rule",
83+
Stage: []string{},
84+
},
85+
event: datamodels.HealthEventWithStatus{
86+
HealthEvent: &protos.HealthEvent{
87+
NodeName: "test-node",
88+
Agent: "gpu-health-monitor",
89+
},
90+
},
91+
description: "Even with empty stages, agent filter should be present",
92+
},
93+
{
94+
name: "rule with complex MongoDB operators",
95+
rule: config.HealthEventsAnalyzerRule{
96+
Name: "complex-operator-rule",
97+
Stage: []string{
98+
`{"$match": {"$expr": {"$gte": ["$healthevent.generatedtimestamp.seconds", {"$subtract": [{"$divide": [{"$toLong": "$$NOW"}, 1000]}, 180]}]}}}`,
99+
`{"$match": {"healthevent.nodename": "this.healthevent.nodename"}}`,
100+
},
101+
},
102+
event: datamodels.HealthEventWithStatus{
103+
HealthEvent: &protos.HealthEvent{
104+
NodeName: "operator-node",
105+
Agent: "platform-connector",
106+
},
107+
},
108+
description: "Complex operator rule should have agent filter as first stage",
109+
},
110+
}
111+
112+
for _, tc := range testCases {
113+
t.Run(tc.name, func(t *testing.T) {
114+
pipeline, err := reconciler.getPipelineStages(tc.rule, tc.event)
115+
require.NoError(t, err, "getPipelineStages should not return an error")
116+
117+
// Verify pipeline has at least the agent filter stage
118+
expectedMinLength := 1 + len(tc.rule.Stage)
119+
assert.GreaterOrEqual(t, len(pipeline), 1, tc.description)
120+
121+
// CRITICAL CHECK: First stage must be the agent filter
122+
firstStage, ok := pipeline[0].(map[string]interface{})
123+
require.True(t, ok, "First stage should be a map")
124+
125+
matchStage, ok := firstStage["$match"].(map[string]interface{})
126+
require.True(t, ok, "First stage should be a $match stage")
127+
128+
// Verify the agent filter exists and excludes "health-events-analyzer"
129+
agentFilter, ok := matchStage["healthevent.agent"].(map[string]interface{})
130+
require.True(t, ok, "Agent filter must be present in first $match stage")
131+
132+
excludeValue, ok := agentFilter["$ne"].(string)
133+
require.True(t, ok, "Agent filter must use $ne operator")
134+
assert.Equal(t, "health-events-analyzer", excludeValue,
135+
"Agent filter must exclude 'health-events-analyzer' to prevent infinite loops")
136+
137+
// Verify the configured stages come after the agent filter
138+
if len(tc.rule.Stage) > 0 {
139+
assert.Equal(t, expectedMinLength, len(pipeline),
140+
"Pipeline should have agent filter + configured stages")
141+
}
142+
})
143+
}
144+
}
145+
146+
// TestGetPipelineStages_AgentFilterPreventsInfiniteLoop tests that events
147+
// generated by health-events-analyzer itself would be excluded by the filter.
148+
func TestGetPipelineStages_AgentFilterPreventsInfiniteLoop(t *testing.T) {
149+
reconciler := &Reconciler{
150+
config: HealthEventsAnalyzerReconcilerConfig{},
151+
}
152+
153+
rule := config.HealthEventsAnalyzerRule{
154+
Name: "test-rule",
155+
Stage: []string{
156+
`{"$match": {"healthevent.nodename": "this.healthevent.nodename"}}`,
157+
},
158+
}
159+
160+
// Create an event that was generated BY health-events-analyzer
161+
// This simulates the case where the analyzer publishes an event that
162+
// would match its own rules if the agent filter is not present
163+
eventFromAnalyzer := datamodels.HealthEventWithStatus{
164+
HealthEvent: &protos.HealthEvent{
165+
NodeName: "test-node",
166+
Agent: "health-events-analyzer", // This is the critical field
167+
CheckName: "RepeatedXidError",
168+
IsFatal: true,
169+
},
170+
}
171+
172+
pipeline, err := reconciler.getPipelineStages(rule, eventFromAnalyzer)
173+
require.NoError(t, err)
174+
175+
// Extract the agent filter from the first stage
176+
firstStage := pipeline[0].(map[string]interface{})
177+
matchStage := firstStage["$match"].(map[string]interface{})
178+
agentFilter := matchStage["healthevent.agent"].(map[string]interface{})
179+
excludeValue := agentFilter["$ne"].(string)
180+
181+
// CRITICAL: Verify that the event's agent matches what we're excluding
182+
assert.Equal(t, eventFromAnalyzer.HealthEvent.Agent, excludeValue,
183+
"The agent filter must exclude events from health-events-analyzer itself")
184+
185+
// In a real MongoDB aggregation pipeline, this $match stage would filter out
186+
// any documents where healthevent.agent == "health-events-analyzer"
187+
// This prevents the analyzer from processing its own generated events
188+
}
189+
190+
// TestGetPipelineStages_AgentFilterPosition verifies that the agent filter
191+
// is always the FIRST stage, before any user-configured stages.
192+
func TestGetPipelineStages_AgentFilterPosition(t *testing.T) {
193+
reconciler := &Reconciler{
194+
config: HealthEventsAnalyzerReconcilerConfig{},
195+
}
196+
197+
rule := config.HealthEventsAnalyzerRule{
198+
Name: "positioned-rule",
199+
Stage: []string{
200+
`{"$match": {"healthevent.isfatal": true}}`,
201+
`{"$match": {"healthevent.nodename": "specific-node"}}`,
202+
`{"$count": "total"}`,
203+
},
204+
}
205+
206+
event := datamodels.HealthEventWithStatus{
207+
HealthEvent: &protos.HealthEvent{
208+
NodeName: "test-node",
209+
Agent: "gpu-health-monitor",
210+
},
211+
}
212+
213+
pipeline, err := reconciler.getPipelineStages(rule, event)
214+
require.NoError(t, err)
215+
216+
// Pipeline should have: 1 agent filter + 3 configured stages = 4 total
217+
assert.Equal(t, 4, len(pipeline), "Pipeline should have agent filter + 3 configured stages")
218+
219+
// Verify first stage is agent filter
220+
firstStage := pipeline[0].(map[string]interface{})
221+
matchStage := firstStage["$match"].(map[string]interface{})
222+
_, hasAgentFilter := matchStage["healthevent.agent"]
223+
assert.True(t, hasAgentFilter, "First stage must be the agent filter")
224+
225+
// Verify second stage is the first configured stage
226+
secondStage := pipeline[1].(map[string]interface{})
227+
secondMatch := secondStage["$match"].(map[string]interface{})
228+
isFatal, hasIsFatal := secondMatch["healthevent.isfatal"]
229+
assert.True(t, hasIsFatal, "Second stage should be first configured stage")
230+
assert.Equal(t, true, isFatal, "Second stage should match isfatal: true")
231+
232+
// Verify third stage is the second configured stage
233+
thirdStage := pipeline[2].(map[string]interface{})
234+
thirdMatch := thirdStage["$match"].(map[string]interface{})
235+
nodeName, hasNodeName := thirdMatch["healthevent.nodename"]
236+
assert.True(t, hasNodeName, "Third stage should be second configured stage")
237+
assert.Equal(t, "specific-node", nodeName, "Third stage should match specific-node")
238+
239+
// Verify fourth stage is the count stage
240+
fourthStage := pipeline[3].(map[string]interface{})
241+
countField, hasCount := fourthStage["$count"]
242+
assert.True(t, hasCount, "Fourth stage should be the $count stage")
243+
assert.Equal(t, "total", countField, "Count field should be 'total'")
244+
}

health-events-analyzer/pkg/reconciler/reconciler_test.go

Lines changed: 35 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -456,19 +456,28 @@ func TestGetPipelineStages(t *testing.T) {
456456

457457
pipeline, err := reconciler.getPipelineStages(rule, event)
458458
assert.NoError(t, err)
459-
assert.Len(t, pipeline, 2)
459+
// Agent filter (1) + configured stages (2) = 3 total
460+
assert.Len(t, pipeline, 3)
460461

461-
// Check first stage
462+
// Check first stage is agent filter
462463
firstStage, ok := pipeline[0].(map[string]interface{})
463464
assert.True(t, ok)
464-
matchStage, ok := firstStage["$match"].(map[string]interface{})
465+
agentMatch, ok := firstStage["$match"].(map[string]interface{})
465466
assert.True(t, ok)
466-
assert.Equal(t, "test-node-1", matchStage["healthevent.nodename"])
467+
_, hasAgentFilter := agentMatch["healthevent.agent"]
468+
assert.True(t, hasAgentFilter, "First stage must be agent filter")
467469

468-
// Check second stage
470+
// Check second stage (first configured stage)
469471
secondStage, ok := pipeline[1].(map[string]interface{})
470472
assert.True(t, ok)
471-
assert.Equal(t, "total", secondStage["$count"])
473+
matchStage, ok := secondStage["$match"].(map[string]interface{})
474+
assert.True(t, ok)
475+
assert.Equal(t, "test-node-1", matchStage["healthevent.nodename"])
476+
477+
// Check third stage (second configured stage)
478+
thirdStage, ok := pipeline[2].(map[string]interface{})
479+
assert.True(t, ok)
480+
assert.Equal(t, "total", thirdStage["$count"])
472481
})
473482

474483
t.Run("complex stages with nested this references", func(t *testing.T) {
@@ -491,11 +500,12 @@ func TestGetPipelineStages(t *testing.T) {
491500

492501
pipeline, err := reconciler.getPipelineStages(rule, event)
493502
assert.NoError(t, err)
494-
assert.Len(t, pipeline, 4)
503+
// Agent filter (1) + configured stages (4) = 5 total
504+
assert.Len(t, pipeline, 5)
495505

496-
// Check that this references are resolved in first stage
497-
firstStage := pipeline[0].(map[string]interface{})
498-
matchStage := firstStage["$match"].(map[string]interface{})
506+
// Check that this references are resolved in second stage (first configured stage)
507+
secondStage := pipeline[1].(map[string]interface{})
508+
matchStage := secondStage["$match"].(map[string]interface{})
499509
assert.Equal(t, "gpu-node-2", matchStage["healthevent.nodename"])
500510
assert.Equal(t, "13", matchStage["healthevent.errorcode.0"])
501511
})
@@ -517,10 +527,11 @@ func TestGetPipelineStages(t *testing.T) {
517527

518528
pipeline, err := reconciler.getPipelineStages(rule, event)
519529
assert.NoError(t, err)
520-
assert.Len(t, pipeline, 1)
530+
// Agent filter (1) + configured stages (1) = 2 total
531+
assert.Len(t, pipeline, 2)
521532

522-
firstStage := pipeline[0].(map[string]interface{})
523-
matchStage := firstStage["$match"].(map[string]interface{})
533+
secondStage := pipeline[1].(map[string]interface{})
534+
matchStage := secondStage["$match"].(map[string]interface{})
524535
assert.Equal(t, "GPU-123", matchStage["healthevent.entitiesimpacted.0.entityvalue"])
525536
})
526537

@@ -574,7 +585,8 @@ func TestGetPipelineStages(t *testing.T) {
574585

575586
pipeline, err := reconciler.getPipelineStages(rule, event)
576587
assert.NoError(t, err)
577-
assert.Len(t, pipeline, 0)
588+
// Even with empty stages, agent filter is always present
589+
assert.Len(t, pipeline, 1)
578590
})
579591

580592
t.Run("stages without any this references", func(t *testing.T) {
@@ -593,11 +605,12 @@ func TestGetPipelineStages(t *testing.T) {
593605

594606
pipeline, err := reconciler.getPipelineStages(rule, event)
595607
assert.NoError(t, err)
596-
assert.Len(t, pipeline, 2)
608+
// Agent filter (1) + configured stages (2) = 3 total
609+
assert.Len(t, pipeline, 3)
597610

598-
// Verify stages remain unchanged
599-
firstStage := pipeline[0].(map[string]interface{})
600-
matchStage := firstStage["$match"].(map[string]interface{})
611+
// Verify configured stages remain unchanged (second stage is first configured)
612+
secondStage := pipeline[1].(map[string]interface{})
613+
matchStage := secondStage["$match"].(map[string]interface{})
601614
assert.Equal(t, true, matchStage["healthevent.isfatal"])
602615
})
603616

@@ -616,10 +629,11 @@ func TestGetPipelineStages(t *testing.T) {
616629

617630
pipeline, err := reconciler.getPipelineStages(rule, event)
618631
assert.NoError(t, err)
619-
assert.Len(t, pipeline, 1)
632+
// Agent filter (1) + configured stages (1) = 2 total
633+
assert.Len(t, pipeline, 2)
620634

621-
firstStage := pipeline[0].(map[string]interface{})
622-
matchStage := firstStage["$match"].(map[string]interface{})
635+
secondStage := pipeline[1].(map[string]interface{})
636+
matchStage := secondStage["$match"].(map[string]interface{})
623637
assert.Equal(t, "operator-node", matchStage["healthevent.nodename"])
624638

625639
// Verify $expr is preserved

node-drainer/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ require (
1313
github.com/prometheus/client_golang v1.23.2
1414
github.com/prometheus/client_model v0.6.2
1515
github.com/stretchr/testify v1.11.1
16-
golang.org/x/sync v0.17.0
16+
golang.org/x/sync v0.18.0
1717
k8s.io/api v0.34.1
1818
k8s.io/apimachinery v0.34.1
1919
k8s.io/client-go v0.34.1

0 commit comments

Comments
 (0)