Skip to content

Commit 8e9900d

Browse files
authored
Merge branch 'main' into HIPPO-1446
2 parents 22fe949 + ba740da commit 8e9900d

File tree

10 files changed

+155
-5
lines changed

10 files changed

+155
-5
lines changed

.github/workflows/container-build-test.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ jobs:
5959
make_command: 'make -C health-monitors/gpu-health-monitor docker-build-dcgm4'
6060
- component: syslog-health-monitor
6161
make_command: 'make -C health-monitors/syslog-health-monitor docker-build'
62+
- component: kubernetes-object-monitor
63+
make_command: 'make -C health-monitors/kubernetes-object-monitor docker-build'
6264
# Log Collection (Docker-based)
6365
- component: log-collector
6466
make_command: 'make -C log-collector docker-build-log-collector'

.github/workflows/lint-test.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ jobs:
9797
include:
9898
- component: syslog-health-monitor
9999
- component: csp-health-monitor
100+
- component: kubernetes-object-monitor
100101
- component: gpu-health-monitor
101102
install_dcgm: 'true'
102103
python_required: 'true'

health-monitors/Makefile

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ GOCOVER_COBERTURA := gocover-cobertura
1010
# Health monitor modules
1111
GO_HEALTH_MONITORS := \
1212
syslog-health-monitor \
13-
csp-health-monitor
13+
csp-health-monitor \
14+
kubernetes-object-monitor
1415

1516
PYTHON_HEALTH_MONITORS := \
1617
gpu-health-monitor
@@ -54,6 +55,10 @@ lint-test-csp-health-monitor:
5455
lint-test-gpu-health-monitor:
5556
$(MAKE) -C gpu-health-monitor lint-test
5657

58+
.PHONY: lint-test-kubernetes-object-monitor
59+
lint-test-kubernetes-object-monitor:
60+
$(MAKE) -C kubernetes-object-monitor lint-test
61+
5762
# Build targets for health monitors (delegate to module Makefiles)
5863
.PHONY: build-all
5964
build-all:
@@ -78,6 +83,10 @@ build-csp-health-monitor:
7883
build-gpu-health-monitor:
7984
$(MAKE) -C gpu-health-monitor setup
8085

86+
.PHONY: build-kubernetes-object-monitor
87+
build-kubernetes-object-monitor:
88+
$(MAKE) -C kubernetes-object-monitor build
89+
8190
# Clean targets (delegate to module Makefiles)
8291
.PHONY: clean-all
8392
clean-all:
@@ -102,6 +111,10 @@ clean-csp-health-monitor:
102111
clean-gpu-health-monitor:
103112
$(MAKE) -C gpu-health-monitor clean
104113

114+
.PHONY: clean-kubernetes-object-monitor
115+
clean-kubernetes-object-monitor:
116+
$(MAKE) -C kubernetes-object-monitor clean
117+
105118
# Help target
106119
.PHONY: help
107120
help:

health-monitors/kubernetes-object-monitor/Dockerfile

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
FROM public.ecr.aws/docker/library/golang:1.25-trixie AS builder
16+
17+
WORKDIR /go/src/nvsentinel
18+
19+
COPY health-monitors/kubernetes-object-monitor/go.mod health-monitors/kubernetes-object-monitor/go.sum health-monitors/kubernetes-object-monitor/
20+
COPY data-models/go.mod data-models/go.sum ./data-models/
21+
COPY commons/go.mod commons/go.sum ./commons/
22+
23+
RUN --mount=type=cache,target=/go/pkg/mod \
24+
cd health-monitors/kubernetes-object-monitor && go mod download
25+
26+
COPY health-monitors/kubernetes-object-monitor/ health-monitors/kubernetes-object-monitor/
27+
COPY data-models/ data-models/
28+
COPY commons/ commons/
29+
30+
RUN cd health-monitors/kubernetes-object-monitor && \
31+
CGO_ENABLED=0 go build -ldflags="-s -w" -o kubernetes-object-monitor main.go
32+
33+
FROM public.ecr.aws/docker/library/debian:bookworm-slim AS runtime
34+
35+
COPY --from=builder /go/src/nvsentinel/health-monitors/kubernetes-object-monitor/kubernetes-object-monitor /app/kubernetes-object-monitor
36+
37+
ENTRYPOINT ["/app/kubernetes-object-monitor"]
38+

health-monitors/kubernetes-object-monitor/Makefile

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# kubernetes-object-monitor Makefile
2+
3+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
4+
5+
IS_GO_MODULE := 1
6+
HAS_DOCKER := 1
7+
8+
include ../../make/common.mk
9+
include ../../make/go.mk
10+
include ../../make/docker.mk
11+
12+
.PHONY: all
13+
all: lint-test
14+
15+
.PHONY: help
16+
help:
17+
@echo "kubernetes-object-monitor Makefile - Using nvsentinel make/*.mk standards"
18+
@echo ""
19+
@echo "Main targets: all, lint-test, ci-test, build, test, lint, clean"
20+
@echo "Docker targets: docker, docker-build, docker-publish"
21+

health-monitors/kubernetes-object-monitor/go.mod

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
module github.com/nvidia/nvsentinel/health-monitors/kubernetes-object-monitor
2+
3+
go 1.25.4
4+
5+
require github.com/nvidia/nvsentinel/commons v0.0.0
6+
7+
replace github.com/nvidia/nvsentinel/commons => ../../commons

health-monitors/kubernetes-object-monitor/go.sum

Whitespace-only changes.

health-monitors/kubernetes-object-monitor/main.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
package main
15+
16+
import (
17+
"context"
18+
"log/slog"
19+
"os"
20+
"os/signal"
21+
"syscall"
22+
23+
"github.com/nvidia/nvsentinel/commons/pkg/logger"
24+
)
25+
26+
const (
27+
defaultAgentName = "kubernetes-object-monitor"
28+
)
29+
30+
var (
31+
version = "dev"
32+
commit = "none"
33+
date = "unknown"
34+
)
35+
36+
func main() {
37+
logLevel := os.Getenv("LOG_LEVEL")
38+
if logLevel == "" {
39+
logLevel = "info"
40+
}
41+
42+
logger.SetDefaultStructuredLogger(defaultAgentName, version)
43+
slog.Info("Starting kubernetes-object-monitor", "version",
44+
version, "commit", commit, "date", date, "log_level", logLevel)
45+
46+
if err := run(); err != nil {
47+
slog.Error("Fatal error", "error", err)
48+
os.Exit(1)
49+
}
50+
}
51+
52+
func run() error {
53+
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
54+
defer stop()
55+
56+
slog.Info("Kubernetes Object Monitor started successfully")
57+
slog.Info("TODO: Implement controller-runtime based monitor with CEL policy evaluation")
58+
59+
<-ctx.Done()
60+
61+
slog.Info("Shutdown signal received, exiting gracefully")
62+
63+
return nil
64+
}

tests/data/health-events-analyzer-config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ data:
77
config.toml: |
88
[[rules]]
99
name = "MultipleRemediations"
10-
description = "Detect if multiple remediations are performed within 3 minutes on a node"
10+
description = "Detect if multiple remediations are performed within 1.5 minutes on a node"
1111
recommended_action = "CONTACT_SUPPORT"
1212
1313
stage = [
@@ -17,7 +17,7 @@ data:
1717
"$expr": {
1818
"$gte": [
1919
"$healthevent.generatedtimestamp.seconds",
20-
{"$subtract": [{"$divide": [{"$toLong": "$$NOW"}, 1000]}, 180]}
20+
{"$subtract": [{"$divide": [{"$toLong": "$$NOW"}, 1000]}, 90]}
2121
]
2222
}
2323
}
@@ -38,7 +38,7 @@ data:
3838
3939
[[rules]]
4040
name = "RepeatedXidError"
41-
description = "Detect occurrence of fatal XIDs 2 times within 3 minutes where the burst window is 10 seconds and sticky XIDs window is 20seconds"
41+
description = "Detect occurrence of fatal XIDs 2 times within 2 minutes where the burst window is 10 seconds and sticky XIDs window is 20seconds"
4242
recommended_action = "CONTACT_SUPPORT"
4343
stage = [
4444
'''
@@ -47,7 +47,7 @@ data:
4747
"$expr": {
4848
"$gte": [
4949
"$healthevent.generatedtimestamp.seconds",
50-
{"$subtract": [{"$divide": [{"$toLong": "$$NOW"}, 1000]}, 180]}
50+
{"$subtract": [{"$divide": [{"$toLong": "$$NOW"}, 1000]}, 120]}
5151
]
5252
}
5353
}

tests/health_events_analyzer_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ func TestMultipleRemediationsCompleted(t *testing.T) {
3737
var newCtx context.Context
3838
newCtx, testCtx = helpers.SetupHealthEventsAnalyzerTest(ctx, t, c, "data/health-events-analyzer-config.yaml", "health-events-analyzer-test")
3939

40+
t.Log("Waiting 90 seconds for the MultipleRemediations rule time window to complete")
41+
time.Sleep(90 * time.Second)
42+
43+
t.Log("Triggering multiple remediations cycle")
4044
client, err := c.NewClient()
4145
require.NoError(t, err)
4246
helpers.TriggerMultipleRemediationsCycle(ctx, t, client, testCtx.NodeName)

0 commit comments

Comments
 (0)