@@ -35,7 +35,6 @@ import (
3535 "google.golang.org/grpc"
3636 "google.golang.org/grpc/connectivity"
3737 "google.golang.org/grpc/credentials/insecure"
38- "gopkg.in/yaml.v3"
3938)
4039
4140const (
5251 date = "unknown"
5352
5453 // Command-line flags
55- configFile = flag .String ("config-file " , "/etc/config/config.yaml " ,
56- "Path to the YAML configuration file for log checks. " )
54+ checksList = flag .String ("checks " , "SysLogsXIDError,SysLogsSXIDError,SysLogsGPUFallenOff " ,
55+ "Comma separated listed of checks to enable " )
5756 platformConnectorSocket = flag .String ("platform-connector-socket" , "unix:///var/run/nvsentinel.sock" ,
5857 "Path to the platform-connector UDS socket." )
5958 nodeNameEnv = flag .String ("node-name" , os .Getenv ("NODE_NAME" ), "Node name. Defaults to NODE_NAME env var." )
6867 "Indicates if this monitor is running in Kata Containers mode (set by DaemonSet variant)." )
6968)
7069
71- // ConfigFile mirrors the top-level structure of the YAML config file; loadConfigWithRetry unmarshals the file contents into a value of this type.
72- type ConfigFile struct {
73- Checks []fd.CheckDefinition `yaml:"checks"` // decoded from the top-level "checks" key of the YAML document
74- }
70+ var checks []fd.CheckDefinition // check definitions handed to the syslog monitor; run() builds this by splitting the checks flag on commas. NOTE(review): each piece is used as the check Name exactly as split (no trimming or validation is visible) — confirm that is intended.
7571
7672func main () {
7773 logger .SetDefaultStructuredLogger (defaultAgentName , version )
@@ -121,14 +117,15 @@ func run() error {
121117
122118 client := pb .NewPlatformConnectorClient (conn )
123119
124- slog .Info ("Loading checks from config file" , "file" , * configFile )
125-
126- config , err := loadConfigWithRetry (ctx , * configFile )
127- if err != nil {
128- return err
120+ checks = make ([]fd.CheckDefinition , 0 )
121+ for c := range strings .SplitSeq ((* checksList ), "," ) {
122+ checks = append (checks , fd.CheckDefinition {
123+ Name : c ,
124+ JournalPath : "/nvsentinel/var/log/journal/" ,
125+ })
129126 }
130127
131- if len (config . Checks ) == 0 {
128+ if len (checks ) == 0 {
132129 return fmt .Errorf ("no checks defined in the config file" )
133130 }
134131
@@ -137,31 +134,31 @@ func run() error {
137134 slog .Info ("Kata mode enabled, adding containerd service filter and removing SysLogsSXIDError check" )
138135
139136 // Add containerd service filter to all checks for kata nodes
140- for i := range config . Checks {
141- if config . Checks [i ].Tags == nil {
142- config . Checks [i ].Tags = []string {"-u" , "containerd.service" }
137+ for i := range checks {
138+ if checks [i ].Tags == nil {
139+ checks [i ].Tags = []string {"-u" , "containerd.service" }
143140 } else {
144- config . Checks [i ].Tags = append (config . Checks [i ].Tags , "-u" , "containerd.service" )
141+ checks [i ].Tags = append (checks [i ].Tags , "-u" , "containerd.service" )
145142 }
146143 }
147144
148145 // Remove SysLogsSXIDError check for kata nodes (not supported in kata environment)
149- filteredChecks := make ([]fd.CheckDefinition , 0 , len (config . Checks ))
146+ filteredChecks := make ([]fd.CheckDefinition , 0 , len (checks ))
150147
151- for _ , check := range config . Checks {
148+ for _ , check := range checks {
152149 if check .Name != "SysLogsSXIDError" {
153150 filteredChecks = append (filteredChecks , check )
154151 }
155152 }
156153
157- config . Checks = filteredChecks
154+ checks = filteredChecks
158155 }
159156
160- slog .Info ("Creating syslog monitor" , "checksCount" , len (config . Checks ))
157+ slog .Info ("Creating syslog monitor" , "checksCount" , len (checks ))
161158
162159 fdHealthMonitor , err := fd .NewSyslogMonitor (
163160 nodeName ,
164- config . Checks ,
161+ checks ,
165162 client ,
166163 defaultAgentName ,
167164 defaultComponentClass ,
@@ -211,7 +208,7 @@ func run() error {
211208 ticker := time .NewTicker (pollingInterval )
212209 defer ticker .Stop ()
213210
214- slog .Info ("Configured checks" , "checks" , config . Checks )
211+ slog .Info ("Configured checks" , "checks" , checks )
215212
216213 slog .Info (
217214 "Syslog health monitor initialization complete, starting polling loop..." ,
@@ -362,54 +359,3 @@ func waitUntilReady(parent context.Context, conn *grpc.ClientConn, timeout time.
362359 }
363360 }
364361}
365-
366- // loadConfigWithRetry reads the YAML file at path, making up to maxConfigRetries attempts and sleeping `attempt` seconds between them, then unmarshals the bytes into a ConfigFile. It returns a wrapped error when the final attempt cannot stat or read the file, or when unmarshalling fails; only the stat/read step is retried, unmarshalling is attempted once.
367- func loadConfigWithRetry (ctx context.Context , path string ) (* ConfigFile , error ) { // NOTE(review): ctx is never referenced in this body, so the sleeps below cannot be cancelled through it
368- const (
369- maxConfigRetries = 5 // upper bound on stat/read attempts
370- )
371-
372- var (
373- yamlFile []byte // raw file contents from the successful read
374- err error
375- )
376-
377- for attempt := 1 ; attempt <= maxConfigRetries ; attempt ++ {
378- slog .Info ("Reading config file" , "attempt" , attempt , "maxRetries" , maxConfigRetries , "file" , path )
379-
380- if _ , statErr := os .Stat (path ); statErr != nil { // any Stat error takes this branch, not only "file does not exist"
381- slog .Warn ("Config file does not exist" , "attempt" , attempt , "maxRetries" , maxConfigRetries , "error" , statErr )
382-
383- if attempt < maxConfigRetries {
384- time .Sleep (time .Duration (attempt ) * time .Second ) // linear back-off: 1s, 2s, 3s, ... before the next attempt
385- continue
386- }
387-
388- return nil , fmt .Errorf ("config file not found after retries: %w" , statErr ) // last attempt failed: give up and wrap the Stat error
389- }
390-
391- yamlFile , err = os .ReadFile (path )
392- if err != nil {
393- slog .Warn ("Error reading config file" , "attempt" , attempt , "maxRetries" , maxConfigRetries , "error" , err )
394-
395- if attempt < maxConfigRetries {
396- time .Sleep (time .Duration (attempt ) * time .Second ) // same linear back-off as the Stat failure path
397- continue
398- }
399-
400- return nil , fmt .Errorf ("failed to read config file after retries: %w" , err ) // last attempt failed: give up and wrap the read error
401- }
402-
403- slog .Info ("Successfully read config file" , "attempt" , attempt )
404-
405- break // file contents are in yamlFile; leave the retry loop
406- }
407-
408- var config ConfigFile
409-
410- if err := yaml .Unmarshal (yamlFile , & config ); err != nil { // this err shadows the outer err and is scoped to the if statement
411- return nil , fmt .Errorf ("error unmarshalling config file: %w" , err )
412- }
413-
414- return & config , nil
415- }
0 commit comments