
Commit 1cb59f6

chore: add tilt test
1 parent 52a6b9e commit 1cb59f6

File tree

8 files changed: +333 -55 lines changed


distros/kubernetes/nvsentinel/charts/health-events-analyzer/values.yaml

Lines changed: 1 addition & 1 deletion

@@ -61,5 +61,5 @@ config: |
     recommended_action = "CONTACT_SUPPORT"

     [[rules.sequence]]
-    criteria = { "healthevent.ishealthy" = false, "healthevent.entitiesimpacted.0.entitytype" = "GPU", "healthevent.entitiesimpacted.0.entityvalue" = "this.healthevent.entitiesimpacted.0.entityvalue", "healthevent.errorcode.0" = "this.healthevent.errorcode.0", "healthevent.nodename" = "this.healthevent.nodename"}
+    criteria = { "healthevent.ishealthy" = false, "healthevent.entitiesimpacted.0.entitytype" = "GPU", "healthevent.entitiesimpacted.0.entityvalue" = "this.healthevent.entitiesimpacted.0.entityvalue", "healthevent.nodename" = "this.healthevent.nodename"}
     errorCount = 5

distros/kubernetes/nvsentinel/charts/mongodb-store/templates/jobs.yaml

Lines changed: 5 additions & 5 deletions

@@ -139,11 +139,11 @@ spec:
           db.$MONGODB_MAINTENANCE_EVENT_COLLECTION_NAME.createIndex(
             { 'cspStatus': 1 },
           );
-          db.$MONGODB_COLLECTION_NAME.createIndex(
-            {'healthevent.nodename': 1},
-            {'healthevent.entitiesimpacted.0.entityvalue': 1},
-            {'healthevent.generatedtimestamp.seconds': 1}
-          )
+          db.$MONGODB_COLLECTION_NAME.createIndex({
+            'healthevent.nodename': 1,
+            'healthevent.entitiesimpacted.0.entityvalue': 1,
+            'healthevent.generatedtimestamp.seconds': 1
+          })
           // Check if user exists before creating
           var userExists = db.getSiblingDB('\$external').getUser('$MONGODB_APPLICATION_USER_DN');
           if (userExists) {
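
Note on the change above: mongosh's createIndex() takes a single keys document (optionally followed by an options document), so the earlier call that passed three separate documents would not have produced the intended compound index (the extra documents are interpreted as options). The corrected call puts all three keys in one document. For reference, a minimal sketch of the same compound index created from Go with the v1 mongo-driver API; the URI, database, and collection names are placeholders, not values taken from this chart:

    // Sketch only: builds the same compound index as the corrected mongosh call.
    // URI, database, and collection names are placeholders.
    package main

    import (
        "context"
        "log"
        "time"

        "go.mongodb.org/mongo-driver/bson"
        "go.mongodb.org/mongo-driver/mongo"
        "go.mongodb.org/mongo-driver/mongo/options"
    )

    func main() {
        ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
        defer cancel()

        client, err := mongo.Connect(ctx, options.Client().ApplyURI("mongodb://localhost:27017"))
        if err != nil {
            log.Fatal(err)
        }
        defer client.Disconnect(ctx)

        coll := client.Database("nvsentinel").Collection("healthevents") // placeholder names

        // One keys document => one compound index over the three fields.
        _, err = coll.Indexes().CreateOne(ctx, mongo.IndexModel{
            Keys: bson.D{
                {Key: "healthevent.nodename", Value: 1},
                {Key: "healthevent.entitiesimpacted.0.entityvalue", Value: 1},
                {Key: "healthevent.generatedtimestamp.seconds", Value: 1},
            },
        })
        if err != nil {
            log.Fatal(err)
        }
    }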

health-events-analyzer/pkg/reconciler/reconciler.go

Lines changed: 57 additions & 45 deletions

@@ -329,7 +329,7 @@ func getFacet(facetName string, timeWindow time.Duration, matchCriteria bson.D)
 			{Key: "healthevent.generatedtimestamp.seconds", Value: bson.D{
 				{Key: "$gte", Value: time.Now().UTC().Add(-timeWindow).Unix()},
 			}},
-			{Key: "healthevent.checkname", Value: bson.D{{Key: "$ne", Value: "HealthEventsAnalyzer"}}},
+			{Key: "healthevent.agent", Value: bson.D{{Key: "$ne", Value: "health-events-analyzer"}}},
 		}}},
 		bson.D{{Key: "$match", Value: matchCriteria}},
 		bson.D{{Key: "$count", Value: "count"}},

@@ -350,7 +350,7 @@ func getFacetWithBurstDetection(facetName string, timeWindow time.Duration,
 			{Key: "healthevent.generatedtimestamp.seconds", Value: bson.D{
 				{Key: "$gte", Value: time.Now().UTC().Add(-timeWindow).Unix()},
 			}},
-			{Key: "healthevent.checkname", Value: bson.D{{Key: "$ne", Value: "HealthEventsAnalyzer"}}},
+			{Key: "healthevent.agent", Value: bson.D{{Key: "$ne", Value: "health-events-analyzer"}}},
 		}}},

 		// Stage 2: Match node, GPU criteria (but NOT XID - we need all XIDs for gap detection)

@@ -376,71 +376,83 @@
 			}},
 		}}},

-		// Stage 5: Calculate gap and mark new bursts
-		bson.D{{Key: "$addFields", Value: bson.D{
-			{Key: "timeSincePrev", Value: bson.D{
-				{Key: "$cond", Value: bson.D{
-					{Key: "if", Value: bson.D{{Key: "$eq", Value: bson.A{"$prevTimestamp", nil}}}},
-					{Key: "then", Value: 0},
-					{Key: "else", Value: bson.D{
-						{Key: "$subtract", Value: bson.A{
-							"$healthevent.generatedtimestamp.seconds",
-							"$prevTimestamp",
-						}},
-					}},
-				}},
-			}},
-			{Key: "isNewBurst", Value: bson.D{
-				{Key: "$cond", Value: bson.D{
-					{Key: "if", Value: bson.D{{Key: "$eq", Value: bson.A{"$prevTimestamp", nil}}}}, // First event
-					{Key: "then", Value: 1},
-					{Key: "else", Value: bson.D{
-						{Key: "$cond", Value: bson.D{
-							{Key: "if", Value: bson.D{
-								{Key: "$gt", Value: bson.A{
-									bson.D{{Key: "$subtract", Value: bson.A{
-										"$healthevent.generatedtimestamp.seconds",
-										"$prevTimestamp",
-									}}},
-									burstGapSeconds, // Gap threshold in seconds
-								}},
-							}},
-							{Key: "then", Value: 1}, // New burst
-							{Key: "else", Value: 0}, // Same burst continues
-						}},
-					}},
-				}},
-			}},
-		}}},
-
-		// Stage 6: Assign burst IDs using cumulative sum - CRITICAL window bounds
+		// Stage 5: Assign burst IDs using cumulative sum with inline burst detection
 		bson.D{{Key: "$setWindowFields", Value: bson.D{
 			{Key: "sortBy", Value: bson.D{
 				{Key: "healthevent.generatedtimestamp.seconds", Value: 1},
 			}},
 			{Key: "output", Value: bson.D{
 				{Key: "burstId", Value: bson.D{
-					{Key: "$sum", Value: "$isNewBurst"},
+					{Key: "$sum", Value: bson.D{
+						{Key: "$cond", Value: bson.D{
+							{Key: "if", Value: bson.D{{Key: "$eq", Value: bson.A{"$prevTimestamp", nil}}}}, // First event
+							{Key: "then", Value: 1},
+							{Key: "else", Value: bson.D{
+								{Key: "$cond", Value: bson.D{
+									{Key: "if", Value: bson.D{
+										{Key: "$gt", Value: bson.A{
+											bson.D{{Key: "$subtract", Value: bson.A{
+												"$healthevent.generatedtimestamp.seconds",
+												"$prevTimestamp",
+											}}},
+											burstGapSeconds, // Gap threshold in seconds
+										}},
+									}},
+									{Key: "then", Value: 1}, // New burst
+									{Key: "else", Value: 0}, // Same burst continues
+								}},
+							}},
+						}},
+					}},
					{Key: "window", Value: bson.D{
 						{Key: "documents", Value: bson.A{"unbounded", "current"}}, // Running sum - CRITICAL
 					}},
 				}},
 			}},
 		}}},

-		// Stage 7: Group by burst and collect XIDs
+		// Stage 6: Group by burst and collect XIDs + count target XID
 		bson.D{{Key: "$group", Value: bson.D{
 			{Key: "_id", Value: bson.D{
 				{Key: "burstId", Value: "$burstId"},
 			}},
-			{Key: "uniqueXidsInBurst", Value: bson.D{{Key: "$push", Value: bson.D{
+			{Key: "uniqueXidsInBurst", Value: bson.D{{Key: "$addToSet", Value: bson.D{
 				{Key: "$arrayElemAt", Value: bson.A{"$healthevent.errorcode", 0}},
 			}}}},
+			{Key: "targetXidCount", Value: bson.D{{Key: "$sum", Value: bson.D{
+				{Key: "$cond", Value: bson.A{
+					bson.D{{Key: "$eq", Value: bson.A{
+						bson.D{{Key: "$arrayElemAt", Value: bson.A{"$healthevent.errorcode", 0}}},
+						targetXID,
+					}}},
+					1,
+					0,
+				}},
+			}}}},
 		}}},

-		// Stage 8: Filter to only bursts containing the target XID
+		// Stage 7: Mark the last/most recent burst using window function
+		bson.D{{Key: "$setWindowFields", Value: bson.D{
+			{Key: "sortBy", Value: bson.D{{Key: "_id.burstId", Value: 1}}},
+			{Key: "output", Value: bson.D{
+				{Key: "maxBurstId", Value: bson.D{{Key: "$max", Value: "$_id.burstId"}}},
+			}},
+		}}},
+
+		// Stage 8: Filter bursts
+		// - Old bursts: include if they contain target XID
+		// - Last burst: include only if targetXidCount == 1 (prevents duplicate publications)
 		bson.D{{Key: "$match", Value: bson.D{
-			{Key: "uniqueXidsInBurst", Value: targetXID},
+			{Key: "$expr", Value: bson.D{
+				{Key: "$and", Value: bson.A{
+					// Must contain target XID
+					bson.D{{Key: "$in", Value: bson.A{targetXID, "$uniqueXidsInBurst"}}},
+					bson.D{{Key: "$or", Value: bson.A{
+						bson.D{{Key: "$ne", Value: bson.A{"$_id.burstId", "$maxBurstId"}}},
+						bson.D{{Key: "$eq", Value: bson.A{"$targetXidCount", 1}}},
+					}}},
+				}},
+			}},
 		}}},

 		// Stage 9: Count distinct bursts and collect burst details
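
To make the reworked aggregation easier to follow, here is a plain-Go sketch (not part of this commit) of the grouping rule that the $setWindowFields running sum implements: events are scanned in timestamp order, and a new burst starts at the first event or whenever the gap to the previous event exceeds burstGapSeconds.

    // Illustration only: assigns burst IDs the same way the pipeline's
    // cumulative $sum over the $cond expression does.
    package main

    import "fmt"

    // assignBurstIDs assumes timestamps are sorted ascending, as the
    // pipeline's sortBy stage guarantees.
    func assignBurstIDs(timestamps []int64, burstGapSeconds int64) []int {
        ids := make([]int, len(timestamps))
        burst := 0
        for i, ts := range timestamps {
            if i == 0 || ts-timestamps[i-1] > burstGapSeconds {
                burst++ // first event, or gap exceeded: start a new burst
            }
            ids[i] = burst
        }
        return ids
    }

    func main() {
        // Three events within 10s of each other, then one 60s later => two bursts.
        fmt.Println(assignBurstIDs([]int64{100, 105, 109, 169}, 10)) // [1 1 1 2]
    }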

health-events-analyzer/pkg/reconciler/reconciler_test.go

Lines changed: 0 additions & 1 deletion

@@ -100,7 +100,6 @@ var (
 			"healthevent.entitiesimpacted.0.entityvalue": "this.healthevent.entitiesimpacted[0].entityvalue",
 			"healthevent.errorcode.0": "13",
 			"healthevent.nodename": "this.healthevent.nodename",
-			"healthevent.checkname": "{\"$ne\": \"HealthEventsAnalyzer\"}",
 		},
 		ErrorCount: 3,
 	}},

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: health-events-analyzer-config
+  namespace: nvsentinel
+data:
+  config.toml: |
+    [[rules]]
+    name = "MultipleFatalError"
+    description = "Detect occurrence of multiple faults within 7 days"
+    time_window = "168h"
+    recommended_action = "CONTACT_SUPPORT"
+
+    [[rules.sequence]]
+    criteria = {"healtheventstatus.faultremediated" = true, "healthevent.nodename" = "this.healthevent.nodename", "healthevent.isfatal" = "this.healthevent.isfatal"}
+    errorCount = 5
+
+    [[rules]]
+    name = "RepeatedXidError"
+    description = "Detect occurrence of fatal XIDs 5 times within 24 hours"
+    time_window = "24h"
+    burst_gap_seconds = 10
+    recommended_action = "CONTACT_SUPPORT"
+
+    [[rules.sequence]]
+    criteria = { "healthevent.ishealthy" = false, "healthevent.entitiesimpacted.0.entitytype" = "GPU", "healthevent.entitiesimpacted.0.entityvalue" = "this.healthevent.entitiesimpacted.0.entityvalue", "healthevent.nodename" = "this.healthevent.nodename", "healthevent.checkname" = "this.healthevent.checkname"}
+    errorCount = 2
+
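
For context, the sketch below shows one way the config.toml document above could be decoded in Go; the struct names, field tags, and the use of github.com/BurntSushi/toml are illustrative assumptions, not taken from the analyzer's source.

    // Hypothetical config structs for illustration only.
    package main

    import (
        "fmt"
        "log"

        "github.com/BurntSushi/toml" // assumed TOML library
    )

    type Sequence struct {
        Criteria   map[string]interface{} `toml:"criteria"`
        ErrorCount int                    `toml:"errorCount"`
    }

    type Rule struct {
        Name              string     `toml:"name"`
        Description       string     `toml:"description"`
        TimeWindow        string     `toml:"time_window"`
        BurstGapSeconds   int        `toml:"burst_gap_seconds"`
        RecommendedAction string     `toml:"recommended_action"`
        Sequence          []Sequence `toml:"sequence"`
    }

    type Config struct {
        Rules []Rule `toml:"rules"`
    }

    func main() {
        const doc = `
    [[rules]]
    name = "RepeatedXidError"
    time_window = "24h"
    burst_gap_seconds = 10
    recommended_action = "CONTACT_SUPPORT"

    [[rules.sequence]]
    criteria = { "healthevent.ishealthy" = false }
    errorCount = 2
    `
        var cfg Config
        if _, err := toml.Decode(doc, &cfg); err != nil {
            log.Fatal(err)
        }
        fmt.Println(cfg.Rules[0].Name, cfg.Rules[0].Sequence[0].ErrorCount) // RepeatedXidError 2
    }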

tests/helpers/kube.go

Lines changed: 82 additions & 2 deletions

@@ -18,6 +18,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"os"
 	"strings"
 	"sync"
 	"testing"

@@ -34,11 +35,13 @@ import (
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"sigs.k8s.io/e2e-framework/klient"
 	"sigs.k8s.io/e2e-framework/klient/k8s/resources"
+	"sigs.k8s.io/yaml"
 )

 const (
 	WaitTimeout  = 10 * time.Minute
 	WaitInterval = 10 * time.Second
+	Namespace    = "nvsentinel"
 )

 // WaitForNodesCordonState waits for nodes with names specified in `nodeNames` to be either cordoned or uncrodoned based on `shouldCordon`. If `shouldCordon` is

@@ -613,7 +616,7 @@ func CreateRebootNodeCR(
 }

 // WaitForNodeConditionWithCheckName waits for the node to have a condition with the reason containing the specified checkName.
-func WaitForNodeConditionWithCheckName(ctx context.Context, t *testing.T, c klient.Client, nodeName, checkName string) {
+func WaitForNodeConditionWithCheckName(ctx context.Context, t *testing.T, c klient.Client, nodeName, checkName string, expectedMessage string) {
 	require.Eventually(t, func() bool {
 		node, err := GetNodeByName(ctx, c, nodeName)
 		if err != nil {

@@ -623,7 +626,7 @@ func WaitForNodeConditionWithCheckName(ctx context.Context, t *testing.T, c klie

 		// Look for a condition where the reason contains the check name
 		for _, condition := range node.Status.Conditions {
-			if condition.Status == v1.ConditionTrue && strings.Contains(condition.Reason, checkName) {
+			if condition.Status == v1.ConditionTrue && strings.Contains(condition.Reason, checkName) && strings.Contains(condition.Message, expectedMessage) {
 				t.Logf("Found node condition: Type=%s, Reason=%s, Status=%s, Message=%s",
 					condition.Type, condition.Reason, condition.Status, condition.Message)
 				return true

@@ -659,3 +662,80 @@ func EnsureNodeConditionNotPresent(ctx context.Context, t *testing.T, c klient.C
 		return false
 	}, waitTimeout, WaitInterval, "node %s should NOT have a condition with check name %s", nodeName, checkName)
 }
+
+// UpdateHealthEventsAnalyzerConfig applies a ConfigMap YAML file and restarts the health-events-analyzer pod
+func UpdateHealthEventsAnalyzerConfig(ctx context.Context, t *testing.T, c klient.Client, configFilePath string, podPrefix string) error {
+	// Read and apply the config file
+	configData, err := os.ReadFile(configFilePath)
+	if err != nil {
+		return fmt.Errorf("failed to read config file %s: %w", configFilePath, err)
+	}
+
+	// Decode YAML into ConfigMap
+	var cm v1.ConfigMap
+	err = yaml.Unmarshal(configData, &cm)
+	if err != nil {
+		return fmt.Errorf("failed to decode config file: %w", err)
+	}
+
+	// Apply (update or create)
+	err = c.Resources().Update(ctx, &cm)
+	if err != nil {
+		err = c.Resources(Namespace).Create(ctx, &cm)
+		if err != nil {
+			return fmt.Errorf("failed to apply config: %w", err)
+		}
+	}
+
+	t.Logf("%s config successfully applied", configFilePath)
+
+	deleteThePod(ctx, t, c, podPrefix, Namespace)
+
+	// Wait for new pod to be ready
+	waitForPodRestart(ctx, t, c, podPrefix, Namespace)
+	return nil
+}
+
+func deleteThePod(ctx context.Context, t *testing.T, c klient.Client, podPrefix, namespace string) error {
+	// Restart by deleting the pod (find by name prefix)
+	var pods v1.PodList
+	err := c.Resources(namespace).List(ctx, &pods)
+	if err != nil {
+		return fmt.Errorf("failed to list pods: %w", err)
+	}
+
+	for _, pod := range pods.Items {
+		if strings.HasPrefix(pod.Name, podPrefix) {
+			err = c.Resources(namespace).Delete(ctx, &pod)
+			if err != nil {
+				return fmt.Errorf("failed to delete pod: %w", err)
+			}
+			t.Logf("Deleted pod %s to trigger restart", pod.Name)
+		}
+	}
+
+	return nil
+}
+
+func waitForPodRestart(ctx context.Context, t *testing.T, c klient.Client, podPrefix, namespace string) {
+	t.Logf("Waiting for health-events-analyzer pod to be ready...")
+	require.Eventually(t, func() bool {
+		var pods v1.PodList
+		err := c.Resources(namespace).List(ctx, &pods)
+		if err != nil {
+			return false
+		}
+
+		for _, pod := range pods.Items {
+			if strings.HasPrefix(pod.Name, podPrefix) && pod.Status.Phase == v1.PodRunning {
+				for _, condition := range pod.Status.Conditions {
+					if condition.Type == v1.PodReady && condition.Status == v1.ConditionTrue {
+						t.Logf("Health-events-analyzer pod %s is ready", pod.Name)
+						return true
+					}
+				}
+			}
+		}
+		return false
+	}, 2*time.Minute, 5*time.Second, "health-events-analyzer pod should be ready")
+}
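
A hypothetical usage sketch (not part of this commit) showing how an e2e test might combine the new UpdateHealthEventsAnalyzerConfig helper with the extended WaitForNodeConditionWithCheckName signature; the module import path, config file path, node name, and check name below are placeholders:

    // Sketch only: placeholder paths and names throughout.
    package tests

    import (
        "context"
        "testing"

        "github.com/stretchr/testify/require"
        "sigs.k8s.io/e2e-framework/pkg/envconf"
        "sigs.k8s.io/e2e-framework/pkg/features"

        "github.com/NVIDIA/nvsentinel/tests/helpers" // assumed import path
    )

    func newRepeatedXidFeature(gpuNodeName string) features.Feature {
        return features.New("repeated XID rule").
            Assess("rule fires after config swap", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
                client := cfg.Client()

                // Swap in the test ConfigMap and bounce the analyzer pod.
                err := helpers.UpdateHealthEventsAnalyzerConfig(ctx, t, client,
                    "testdata/health-events-analyzer-config.yaml", // placeholder path
                    "health-events-analyzer")
                require.NoError(t, err)

                // The new expectedMessage argument narrows the match on the
                // condition message; pass "" to keep the previous behaviour.
                helpers.WaitForNodeConditionWithCheckName(ctx, t, client, gpuNodeName,
                    "RepeatedXidError", "")

                return ctx
            }).Feature()
    }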

tests/multiple_fatal_event_test.go

Lines changed: 1 addition & 1 deletion

@@ -92,7 +92,7 @@ func TestMultipleFatalEventRule(t *testing.T) {
 		assert.NoError(t, err, "failed to send fatal events")

 		// Check node condition for matched ruleset
-		helpers.WaitForNodeConditionWithCheckName(ctx, t, client, gpuNodeName, "MultipleFatalError")
+		helpers.WaitForNodeConditionWithCheckName(ctx, t, client, gpuNodeName, "MultipleFatalError", "")

 		return ctx
 	})
