Skip to content

Commit 98e2a5e

Browse files
authored
chore: simplify syslog config (#227)
1 parent 7028008 commit 98e2a5e

File tree

6 files changed

+34
-131
lines changed

6 files changed

+34
-131
lines changed

distros/kubernetes/nvsentinel/charts/syslog-health-monitor/templates/_helpers.tpl

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,6 @@ spec:
110110
image: "{{ $root.Values.global.syslogHealthMonitor.image.repository }}:{{ $root.Values.global.image.tag | default $root.Chart.AppVersion }}"
111111
imagePullPolicy: {{ $root.Values.global.image.pullPolicy }}
112112
args:
113-
- "--config-file"
114-
- "/etc/syslog-monitor/log_check_definitions.yaml"
115113
- "--polling-interval"
116114
- "{{ $root.Values.pollingInterval }}"
117115
- "--metrics-port"
@@ -122,6 +120,8 @@ spec:
122120
- "--xid-analyser-endpoint"
123121
- "http://localhost:8080"
124122
{{- end }}
123+
- "--checks"
124+
- "{{ join "," $root.Values.enabledChecks }}"
125125
resources:
126126
{{- toYaml $root.Values.resources | nindent 12 }}
127127
ports:
@@ -150,9 +150,6 @@ spec:
150150
apiVersion: v1
151151
fieldPath: spec.nodeName
152152
volumeMounts:
153-
- name: config-volume
154-
mountPath: /etc/syslog-monitor
155-
readOnly: true
156153
- name: var-run-vol
157154
mountPath: /var/run/
158155
- name: syslog-state-vol
@@ -200,9 +197,6 @@ spec:
200197
value: "8080"
201198
{{- end }}
202199
volumes:
203-
- name: config-volume
204-
configMap:
205-
name: {{ include "syslog-health-monitor.fullname" $root }}
206200
- name: var-run-vol
207201
hostPath:
208202
path: /var/run/nvsentinel

distros/kubernetes/nvsentinel/charts/syslog-health-monitor/templates/configmap.yaml

Lines changed: 0 additions & 48 deletions
This file was deleted.

distros/kubernetes/nvsentinel/charts/syslog-health-monitor/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,8 @@ driverWatcher:
7575
requests:
7676
cpu: 50m
7777
memory: 64Mi
78+
79+
enabledChecks:
80+
- SysLogsXIDError
81+
- SysLogsSXIDError
82+
- SysLogsGPUFallenOff

health-monitors/syslog-health-monitor/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ require (
1515
golang.org/x/sync v0.17.0
1616
google.golang.org/grpc v1.76.0
1717
google.golang.org/protobuf v1.36.10
18-
gopkg.in/yaml.v3 v3.0.1
1918
k8s.io/apimachinery v0.34.1
2019
)
2120

@@ -39,6 +38,7 @@ require (
3938
golang.org/x/sys v0.37.0 // indirect
4039
golang.org/x/text v0.30.0 // indirect
4140
google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f // indirect
41+
gopkg.in/yaml.v3 v3.0.1 // indirect
4242
k8s.io/klog/v2 v2.130.1 // indirect
4343
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
4444
)

health-monitors/syslog-health-monitor/main.go

Lines changed: 20 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ import (
3535
"google.golang.org/grpc"
3636
"google.golang.org/grpc/connectivity"
3737
"google.golang.org/grpc/credentials/insecure"
38-
"gopkg.in/yaml.v3"
3938
)
4039

4140
const (
@@ -52,8 +51,8 @@ var (
5251
date = "unknown"
5352

5453
// Command-line flags
55-
configFile = flag.String("config-file", "/etc/config/config.yaml",
56-
"Path to the YAML configuration file for log checks.")
54+
checksList = flag.String("checks", "SysLogsXIDError,SysLogsSXIDError,SysLogsGPUFallenOff",
55+
"Comma-separated list of checks to enable")
5756
platformConnectorSocket = flag.String("platform-connector-socket", "unix:///var/run/nvsentinel.sock",
5857
"Path to the platform-connector UDS socket.")
5958
nodeNameEnv = flag.String("node-name", os.Getenv("NODE_NAME"), "Node name. Defaults to NODE_NAME env var.")
@@ -68,10 +67,7 @@ var (
6867
"Indicates if this monitor is running in Kata Containers mode (set by DaemonSet variant).")
6968
)
7069

71-
// ConfigFile matches the top-level structure of the YAML config file
72-
type ConfigFile struct {
73-
Checks []fd.CheckDefinition `yaml:"checks"`
74-
}
70+
var checks []fd.CheckDefinition
7571

7672
func main() {
7773
logger.SetDefaultStructuredLogger(defaultAgentName, version)
@@ -121,14 +117,15 @@ func run() error {
121117

122118
client := pb.NewPlatformConnectorClient(conn)
123119

124-
slog.Info("Loading checks from config file", "file", *configFile)
125-
126-
config, err := loadConfigWithRetry(ctx, *configFile)
127-
if err != nil {
128-
return err
120+
checks = make([]fd.CheckDefinition, 0)
121+
for c := range strings.SplitSeq((*checksList), ",") {
122+
checks = append(checks, fd.CheckDefinition{
123+
Name: c,
124+
JournalPath: "/nvsentinel/var/log/journal/",
125+
})
129126
}
130127

131-
if len(config.Checks) == 0 {
128+
if len(checks) == 0 {
132129
return fmt.Errorf("no checks defined in the config file")
133130
}
134131

@@ -137,31 +134,31 @@ func run() error {
137134
slog.Info("Kata mode enabled, adding containerd service filter and removing SysLogsSXIDError check")
138135

139136
// Add containerd service filter to all checks for kata nodes
140-
for i := range config.Checks {
141-
if config.Checks[i].Tags == nil {
142-
config.Checks[i].Tags = []string{"-u", "containerd.service"}
137+
for i := range checks {
138+
if checks[i].Tags == nil {
139+
checks[i].Tags = []string{"-u", "containerd.service"}
143140
} else {
144-
config.Checks[i].Tags = append(config.Checks[i].Tags, "-u", "containerd.service")
141+
checks[i].Tags = append(checks[i].Tags, "-u", "containerd.service")
145142
}
146143
}
147144

148145
// Remove SysLogsSXIDError check for kata nodes (not supported in kata environment)
149-
filteredChecks := make([]fd.CheckDefinition, 0, len(config.Checks))
146+
filteredChecks := make([]fd.CheckDefinition, 0, len(checks))
150147

151-
for _, check := range config.Checks {
148+
for _, check := range checks {
152149
if check.Name != "SysLogsSXIDError" {
153150
filteredChecks = append(filteredChecks, check)
154151
}
155152
}
156153

157-
config.Checks = filteredChecks
154+
checks = filteredChecks
158155
}
159156

160-
slog.Info("Creating syslog monitor", "checksCount", len(config.Checks))
157+
slog.Info("Creating syslog monitor", "checksCount", len(checks))
161158

162159
fdHealthMonitor, err := fd.NewSyslogMonitor(
163160
nodeName,
164-
config.Checks,
161+
checks,
165162
client,
166163
defaultAgentName,
167164
defaultComponentClass,
@@ -211,7 +208,7 @@ func run() error {
211208
ticker := time.NewTicker(pollingInterval)
212209
defer ticker.Stop()
213210

214-
slog.Info("Configured checks", "checks", config.Checks)
211+
slog.Info("Configured checks", "checks", checks)
215212

216213
slog.Info(
217214
"Syslog health monitor initialization complete, starting polling loop...",
@@ -362,54 +359,3 @@ func waitUntilReady(parent context.Context, conn *grpc.ClientConn, timeout time.
362359
}
363360
}
364361
}
365-
366-
// loadConfigWithRetry reads and unmarshals the YAML config with bounded retries.
367-
func loadConfigWithRetry(ctx context.Context, path string) (*ConfigFile, error) {
368-
const (
369-
maxConfigRetries = 5
370-
)
371-
372-
var (
373-
yamlFile []byte
374-
err error
375-
)
376-
377-
for attempt := 1; attempt <= maxConfigRetries; attempt++ {
378-
slog.Info("Reading config file", "attempt", attempt, "maxRetries", maxConfigRetries, "file", path)
379-
380-
if _, statErr := os.Stat(path); statErr != nil {
381-
slog.Warn("Config file does not exist", "attempt", attempt, "maxRetries", maxConfigRetries, "error", statErr)
382-
383-
if attempt < maxConfigRetries {
384-
time.Sleep(time.Duration(attempt) * time.Second)
385-
continue
386-
}
387-
388-
return nil, fmt.Errorf("config file not found after retries: %w", statErr)
389-
}
390-
391-
yamlFile, err = os.ReadFile(path)
392-
if err != nil {
393-
slog.Warn("Error reading config file", "attempt", attempt, "maxRetries", maxConfigRetries, "error", err)
394-
395-
if attempt < maxConfigRetries {
396-
time.Sleep(time.Duration(attempt) * time.Second)
397-
continue
398-
}
399-
400-
return nil, fmt.Errorf("failed to read config file after retries: %w", err)
401-
}
402-
403-
slog.Info("Successfully read config file", "attempt", attempt)
404-
405-
break
406-
}
407-
408-
var config ConfigFile
409-
410-
if err := yaml.Unmarshal(yamlFile, &config); err != nil {
411-
return nil, fmt.Errorf("error unmarshalling config file: %w", err)
412-
}
413-
414-
return &config, nil
415-
}

labeler-module/pkg/labeler/labeler.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,12 @@ func (l *Labeler) handleNodeEvent(obj any) error {
454454

455455
expectedKataLabel := l.getKataLabelForNode(node)
456456

457+
currentKataLabel := node.Labels[KataEnabledLabel]
458+
if currentKataLabel == expectedKataLabel {
459+
slog.Debug("Node already has correct kata label", "node", node.Name, "kata", expectedKataLabel)
460+
return nil
461+
}
462+
457463
// Only update kata label, leave DCGM/driver labels alone
458464
return l.updateKataLabel(node.Name, expectedKataLabel)
459465
}

0 commit comments

Comments
 (0)