diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 7bcd93e60..bda7eef44 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -59,6 +59,13 @@ updates:
       interval: "weekly"
     labels: ["dependencies"]
 
+  - package-ecosystem: "gomod"
+    directory: "/commons"
+    target-branch: "main"
+    schedule:
+      interval: "weekly"
+    labels: ["dependencies"]
+
   # Fault Management
   - package-ecosystem: "gomod"
     directory: "/fault-quarantine-module"
diff --git a/commons/Makefile b/commons/Makefile
index 97770cdc2..c2397e0cd 100644
--- a/commons/Makefile
+++ b/commons/Makefile
@@ -1,4 +1,4 @@
-# Logger SDK Makefile
+# Commons Makefile
 # Individual module build and test targets
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
@@ -40,8 +40,8 @@ all: lint-test ## Run lint-test (default target)
 # =============================================================================
 # MODULE NOTES
 # =============================================================================
-# This is a library module providing centralized logging initialization.
-# - Library module providing structured logging utilities
+# This is a library module providing common utilities for NVSentinel.
+# - Library module providing structured logging and configuration management
 # - No binary output or Docker builds
-# - Used by all nvsentinel modules for consistent logging setup
+# - Used by all nvsentinel modules for consistent logging and config loading
 # Run 'make help' to see available targets
diff --git a/commons/go.mod b/commons/go.mod
index f0d805ce7..4f78bc1b6 100644
--- a/commons/go.mod
+++ b/commons/go.mod
@@ -5,6 +5,7 @@ go 1.25
 toolchain go1.25.3
 
 require (
+	github.com/BurntSushi/toml v1.5.0
 	github.com/prometheus/client_golang v1.23.2
 	github.com/stretchr/testify v1.11.1
 	golang.org/x/sync v0.17.0
diff --git a/commons/go.sum b/commons/go.sum
index 25754693d..3503e76c2 100644
--- a/commons/go.sum
+++ b/commons/go.sum
@@ -1,3 +1,5 @@
+github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg=
+github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
diff --git a/commons/pkg/configmanager/env.go b/commons/pkg/configmanager/env.go
new file mode 100644
index 000000000..fcc02fa89
--- /dev/null
+++ b/commons/pkg/configmanager/env.go
@@ -0,0 +1,244 @@
+// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package configmanager
+
+import (
+    "fmt"
+    "math"
+    "os"
+    "strconv"
+    "strings"
+)
+
+// GetEnvVar retrieves an environment variable and converts it to type T.
+// Type must be explicitly specified: GetEnvVar[int]("PORT", nil, nil)
+// If defaultValue is nil, the environment variable is required.
+// If defaultValue is non-nil, it will be used when the environment variable is not set.
+// Optional validator function validates the final value (from env or default).
+//
+// Supported types:
+// - int
+// - uint
+// - float64
+// - bool (accepts: "true" or "false", case-insensitive)
+// - string
+//
+// Example usage:
+//
+//    // Required env var
+//    port, err := configmanager.GetEnvVar[int]("PORT", nil, nil)
+//
+//    // With default value
+//    defaultTimeout := 30
+//    timeout, err := configmanager.GetEnvVar[int]("TIMEOUT", &defaultTimeout, nil)
+//
+//    // With default and validation
+//    defaultMaxConn := 100
+//    maxConn, err := configmanager.GetEnvVar[int]("MAX_CONN", &defaultMaxConn, func(v int) error {
+//        if v <= 0 { return fmt.Errorf("must be positive") }
+//        return nil
+//    })
+//
+//    // Required with validation
+//    workers, err := configmanager.GetEnvVar[int]("WORKERS", nil, func(v int) error {
+//        if v <= 0 { return fmt.Errorf("must be positive") }
+//        return nil
+//    })
+func GetEnvVar[T any](name string, defaultValue *T, validator func(T) error) (T, error) {
+    var zero T
+
+    valueStr, exists := os.LookupEnv(name)
+    if !exists {
+        return handleMissingEnvVarWithDefault(name, defaultValue, validator)
+    }
+
+    value, err := parseValue[T](valueStr)
+    if err != nil {
+        return zero, fmt.Errorf("error converting %s: %w", name, err)
+    }
+
+    if validator != nil {
+        if err := validator(value); err != nil {
+            return zero, fmt.Errorf("validation failed for %s: %w", name, err)
+        }
+    }
+
+    return value, nil
+}
+
+func handleMissingEnvVarWithDefault[T any](name string, defaultValue *T, validator func(T) error) (T, error) {
+    var zero T
+
+    if defaultValue == nil {
+        return zero, fmt.Errorf("environment variable %s is not set", name)
+    }
+
+    if validator != nil {
+        if err := validator(*defaultValue); err != nil {
+            return zero, fmt.Errorf("validation failed for default value of %s: %w", name, err)
+        }
+    }
+
+    return *defaultValue, nil
+}
+
+func parseValue[T any](valueStr string) (T, error) {
+    var zero T
+
+    switch any(zero).(type) {
+    case string:
+        return any(valueStr).(T), nil
+    case int:
+        return parseAndConvert[T](parseInt(valueStr))
+    case uint:
+        return parseAndConvert[T](parseUint(valueStr))
+    case float64:
+        return parseAndConvert[T](parseFloat64(valueStr))
+    case bool:
+        return parseAndConvert[T](parseBool(valueStr))
+    default:
+        return zero, fmt.Errorf("unsupported type %T", zero)
+    }
+}
+
+func parseAndConvert[T any](value any, err error) (T, error) {
+    var zero T
+    if err != nil {
+        return zero, err
+    }
+
+    return any(value).(T), nil
+}
+
+func parseInt(valueStr string) (int, error) {
+    v, err := strconv.ParseInt(valueStr, 10, 64)
+    if err != nil {
+        return 0, err
+    }
+
+    if v < math.MinInt || v > math.MaxInt {
+        return 0, fmt.Errorf("value %d out of range for int type", v)
+    }
+
+    return int(v), nil
+}
+
+func parseUint(valueStr string) (uint, error) {
+    v, err := strconv.ParseUint(valueStr, 10, 64)
+    if err != nil {
+        return 0, err
+    }
+
+    if v > math.MaxUint {
+        return 0, fmt.Errorf("value %d out of range for uint type", v)
+    }
+
+    return uint(v), nil
+}
+
+func parseFloat64(valueStr string) (float64, error) {
+    return strconv.ParseFloat(valueStr, 64)
+}
+
+// parseBool parses boolean values (accepts "true" or "false")
+func parseBool(valueStr string) (bool, error) {
+    valueStr = strings.ToLower(strings.TrimSpace(valueStr))
+
+    switch valueStr {
+    case "true":
+        return true, nil
+    case "false":
+        return false, nil
+    default:
+        return false, fmt.Errorf("invalid boolean value: %s (must be 'true' or 'false')", valueStr)
+    }
+}
+
+// EnvVarSpec defines a specification for reading an environment variable.
+// All fields except Name are optional.
+//
+// Example usage:
+//
+//    specs := []configmanager.EnvVarSpec{
+//        {Name: "DATABASE_URL"},                                // Required by default
+//        {Name: "PORT", Optional: true, DefaultValue: "5432"},  // Optional with default
+//    }
+//    envVars, errors := configmanager.ReadEnvVars(specs)
+//    if len(errors) > 0 {
+//        return fmt.Errorf("missing required vars: %v", errors)
+//    }
+type EnvVarSpec struct {
+    Name     string // Required: The environment variable name to read
+    Optional bool   // Optional: If true, env var is optional; if false, it's required (default: false/required)
+    // DefaultValue is used when env var is not set.
+    // Empty string defaults are treated as "no value" and excluded from results map.
+    DefaultValue string
+}
+
+// ReadEnvVars reads multiple environment variables based on the provided specifications.
+// Returns a map of environment variable names to their values and a slice of errors.
+// Environment variables are required by default unless Optional is set to true.
+//
+// Example usage:
+//
+//    specs := []configmanager.EnvVarSpec{
+//        {Name: "MONGODB_URI"},                                             // Required
+//        {Name: "MONGODB_DATABASE_NAME"},                                   // Required
+//        {Name: "MONGODB_PORT", Optional: true, DefaultValue: "27017"},     // Included with default
+//        {Name: "DEBUG_MODE", Optional: true, DefaultValue: ""},            // NOT included (empty default)
+//    }
+//    envVars, errors := configmanager.ReadEnvVars(specs)
+//    if len(errors) > 0 {
+//        log.Fatalf("Missing required environment variables: %v", errors)
+//    }
+//    // Use the values
+//    dbURI := envVars["MONGODB_URI"]
+//    dbName := envVars["MONGODB_DATABASE_NAME"]
+//    dbPort := envVars["MONGODB_PORT"] // Will be "27017" if not set
+func ReadEnvVars(specs []EnvVarSpec) (map[string]string, []error) {
+    results := make(map[string]string)
+
+    var errors []error
+
+    for _, spec := range specs {
+        value, exists := os.LookupEnv(spec.Name)
+
+        if !exists {
+            defaultVal, err := handleMissingEnvVar(spec)
+            if err != nil {
+                errors = append(errors, err)
+            }
+
+            // Only include non-empty defaults in results map to distinguish "not set" from "set to empty"
+            if defaultVal != "" {
+                results[spec.Name] = defaultVal
+            }
+
+            continue
+        }
+
+        results[spec.Name] = value
+    }
+
+    return results, errors
+}
+
+func handleMissingEnvVar(spec EnvVarSpec) (string, error) {
+    if spec.Optional {
+        return spec.DefaultValue, nil
+    }
+
+    return "", fmt.Errorf("required environment variable %s is not set", spec.Name)
+}
diff --git a/commons/pkg/configmanager/env_test.go b/commons/pkg/configmanager/env_test.go
new file mode 100644
index 000000000..7fa913ed3
--- /dev/null
+++ b/commons/pkg/configmanager/env_test.go
@@ -0,0 +1,408 @@
+// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package configmanager + +import ( + "fmt" + "math" + "strconv" + "testing" +) + +func TestReadEnvVars(t *testing.T) { + t.Setenv("TEST_VAR_1", "value1") + t.Setenv("TEST_VAR_2", "value2") + + specs := []EnvVarSpec{ + { + Name: "TEST_VAR_1", // Required by default + }, + { + Name: "TEST_VAR_2", + Optional: true, // Explicitly optional + }, + { + Name: "TEST_VAR_3", + Optional: true, // Optional with default + DefaultValue: "default", + }, + { + Name: "TEST_VAR_4", // Required by default, but missing + }, + } + + results, errors := ReadEnvVars(specs) + + if len(errors) != 1 { + t.Errorf("expected 1 error, got %d", len(errors)) + } + + if results["TEST_VAR_1"] != "value1" { + t.Errorf("expected value1, got %s", results["TEST_VAR_1"]) + } + + if results["TEST_VAR_2"] != "value2" { + t.Errorf("expected value2, got %s", results["TEST_VAR_2"]) + } + + if results["TEST_VAR_3"] != "default" { + t.Errorf("expected default, got %s", results["TEST_VAR_3"]) + } + + if _, exists := results["TEST_VAR_4"]; exists { + t.Error("TEST_VAR_4 should not be in results map when required and missing") + } +} + +func TestReadEnvVarsOptionalWithEmptyDefault(t *testing.T) { + specs := []EnvVarSpec{ + { + Name: "MISSING_OPTIONAL_EMPTY_DEFAULT", + Optional: true, + DefaultValue: "", + }, + { + Name: "MISSING_OPTIONAL_WITH_DEFAULT", + Optional: true, + DefaultValue: "some_value", + }, + } + + results, errors := ReadEnvVars(specs) + + if len(errors) != 0 { + t.Errorf("expected 0 errors, got %d", len(errors)) + } + + if _, exists := results["MISSING_OPTIONAL_EMPTY_DEFAULT"]; exists { + t.Error("optional var with empty default should not be in results map") + } + + if val, exists := results["MISSING_OPTIONAL_WITH_DEFAULT"]; !exists || val != "some_value" { + t.Errorf("optional var with non-empty default should be in results map with value 'some_value', got %v", val) + } +} + +func TestGetEnvVar(t *testing.T) { + t.Run("required with validation", func(t *testing.T) { + t.Setenv("TEST_REQUIRED", "42") + + value, err := GetEnvVar("TEST_REQUIRED", nil, func(v int) error { + if v <= 0 { + return fmt.Errorf("must be positive") + } + return nil + }) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if value != 42 { + t.Errorf("expected 42, got %d", value) + } + }) + + t.Run("missing required returns error", func(t *testing.T) { + _, err := GetEnvVar[int]("TEST_MISSING_REQUIRED", nil, nil) + if err == nil { + t.Error("expected error for missing env var but got none") + } + }) + + t.Run("with default value", func(t *testing.T) { + defaultVal := 99 + value, err := GetEnvVar("TEST_WITH_DEFAULT", &defaultVal, nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if value != 99 { + t.Errorf("expected 99, got %d", value) + } + }) + + t.Run("with default and validation", func(t *testing.T) { + t.Setenv("TEST_DEFAULT_VAL", "42") + + defaultVal := 10 + value, err := GetEnvVar("TEST_DEFAULT_VAL", &defaultVal, func(v int) error { + if v <= 0 { + return fmt.Errorf("must be positive") + } + return nil + }) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if value != 42 { + t.Errorf("expected 42, got %d", value) + } + }) + + t.Run("validation failure", func(t *testing.T) { + t.Setenv("TEST_VALIDATION_FAIL", "5") + + _, err := GetEnvVar("TEST_VALIDATION_FAIL", nil, func(v int) error { + if v <= 10 { + return fmt.Errorf("must be greater than 10") + } + return nil + }) + if err == nil { + t.Error("expected validation error but got none") + } + }) + + t.Run("different types work", func(t *testing.T) { 
+ t.Setenv("TEST_STRING", "hello") + t.Setenv("TEST_BOOL", "true") + t.Setenv("TEST_FLOAT", "3.14") + + strVal, err := GetEnvVar[string]("TEST_STRING", nil, nil) + if err != nil || strVal != "hello" { + t.Errorf("string test failed: %v, got %s", err, strVal) + } + + boolVal, err := GetEnvVar[bool]("TEST_BOOL", nil, nil) + if err != nil || !boolVal { + t.Errorf("bool test failed: %v, got %v", err, boolVal) + } + + floatVal, err := GetEnvVar[float64]("TEST_FLOAT", nil, nil) + if err != nil || floatVal != 3.14 { + t.Errorf("float test failed: %v, got %f", err, floatVal) + } + }) +} + +func TestGetEnvVarAllSupportedTypes(t *testing.T) { + t.Run("int type", func(t *testing.T) { + t.Setenv("TEST_INT", "42") + + value, err := GetEnvVar[int]("TEST_INT", nil, nil) + if err != nil || value != 42 { + t.Errorf("int test failed: %v, got %d", err, value) + } + }) + + t.Run("uint type", func(t *testing.T) { + t.Setenv("TEST_UINT", "4294967295") + + value, err := GetEnvVar[uint]("TEST_UINT", nil, nil) + if err != nil || value != 4294967295 { + t.Errorf("uint test failed: %v, got %d", err, value) + } + }) + + t.Run("float64 type", func(t *testing.T) { + t.Setenv("TEST_FLOAT64", "3.14159") + + value, err := GetEnvVar[float64]("TEST_FLOAT64", nil, nil) + if err != nil || value != 3.14159 { + t.Errorf("float64 test failed: %v, got %f", err, value) + } + }) + + t.Run("bool type", func(t *testing.T) { + t.Setenv("TEST_BOOL", "true") + + value, err := GetEnvVar[bool]("TEST_BOOL", nil, nil) + if err != nil || !value { + t.Errorf("bool test failed: %v, got %v", err, value) + } + }) + + t.Run("string type", func(t *testing.T) { + t.Setenv("TEST_STRING_TYPE", "hello world") + + value, err := GetEnvVar[string]("TEST_STRING_TYPE", nil, nil) + if err != nil || value != "hello world" { + t.Errorf("string test failed: %v, got %s", err, value) + } + }) +} + +func TestParseIntBoundsChecking(t *testing.T) { + tests := []struct { + name string + input string + expectError bool + }{ + { + name: "valid positive int", + input: "42", + expectError: false, + }, + { + name: "valid negative int", + input: "-42", + expectError: false, + }, + { + name: "max int", + input: strconv.FormatInt(int64(math.MaxInt), 10), + expectError: false, + }, + { + name: "min int", + input: strconv.FormatInt(int64(math.MinInt), 10), + expectError: false, + }, + { + name: "invalid non-numeric string", + input: "not-a-number", + expectError: true, + }, + { + name: "empty string", + input: "", + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := parseInt(tt.input) + if tt.expectError { + if err == nil { + t.Errorf("expected error but got none for input %q", tt.input) + } + } else { + if err != nil { + t.Errorf("unexpected error for input %q: %v", tt.input, err) + } + expectedInt, parseErr := strconv.Atoi(tt.input) + if parseErr != nil { + t.Fatalf("test setup error: strconv.Atoi failed for valid input %q: %v", tt.input, parseErr) + } + if result != expectedInt { + t.Errorf("expected %d, got %d", expectedInt, result) + } + } + }) + } +} + +func TestParseUintBoundsChecking(t *testing.T) { + tests := []struct { + name string + input string + expectError bool + }{ + { + name: "valid uint", + input: "42", + expectError: false, + }, + { + name: "zero", + input: "0", + expectError: false, + }, + { + name: "max uint", + input: strconv.FormatUint(uint64(math.MaxUint), 10), + expectError: false, + }, + { + name: "negative number", + input: "-1", + expectError: true, + }, + { + name: "invalid non-numeric 
string",
+            input:       "not-a-number",
+            expectError: true,
+        },
+        {
+            name:        "empty string",
+            input:       "",
+            expectError: true,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            result, err := parseUint(tt.input)
+            if tt.expectError {
+                if err == nil {
+                    t.Errorf("expected error but got none for input %q", tt.input)
+                }
+            } else {
+                if err != nil {
+                    t.Errorf("unexpected error for input %q: %v", tt.input, err)
+                }
+                expectedUint, parseErr := strconv.ParseUint(tt.input, 10, 64)
+                if parseErr != nil {
+                    t.Fatalf("test setup error: strconv.ParseUint failed for valid input %q: %v", tt.input, parseErr)
+                }
+                if result != uint(expectedUint) {
+                    t.Errorf("expected %d, got %d", expectedUint, result)
+                }
+            }
+        })
+    }
+}
+
+func TestGetEnvVarBoundsValidation(t *testing.T) {
+    t.Run("int within valid range", func(t *testing.T) {
+        t.Setenv("TEST_VALID_INT", "100")
+        value, err := GetEnvVar[int]("TEST_VALID_INT", nil, nil)
+        if err != nil {
+            t.Errorf("unexpected error: %v", err)
+        }
+        if value != 100 {
+            t.Errorf("expected 100, got %d", value)
+        }
+    })
+
+    t.Run("uint within valid range", func(t *testing.T) {
+        t.Setenv("TEST_VALID_UINT", "100")
+        value, err := GetEnvVar[uint]("TEST_VALID_UINT", nil, nil)
+        if err != nil {
+            t.Errorf("unexpected error: %v", err)
+        }
+        if value != 100 {
+            t.Errorf("expected 100, got %d", value)
+        }
+    })
+
+    t.Run("negative uint should fail", func(t *testing.T) {
+        t.Setenv("TEST_NEGATIVE_UINT", "-1")
+        _, err := GetEnvVar[uint]("TEST_NEGATIVE_UINT", nil, nil)
+        if err == nil {
+            t.Error("expected error for negative uint but got none")
+        }
+    })
+
+    t.Run("invalid int string should fail", func(t *testing.T) {
+        t.Setenv("TEST_INVALID_INT", "not-a-number")
+        _, err := GetEnvVar[int]("TEST_INVALID_INT", nil, nil)
+        if err == nil {
+            t.Error("expected error for invalid int string but got none")
+        }
+    })
+
+    t.Run("default value validation failure", func(t *testing.T) {
+        invalidDefault := -5
+        _, err := GetEnvVar("TEST_MISSING_WITH_INVALID_DEFAULT", &invalidDefault, func(v int) error {
+            if v <= 0 {
+                return fmt.Errorf("must be positive")
+            }
+            return nil
+        })
+        if err == nil {
+            t.Error("expected validation error for invalid default value but got none")
+        }
+    })
+}
diff --git a/commons/pkg/configmanager/loader.go b/commons/pkg/configmanager/loader.go
new file mode 100644
index 000000000..76670b63a
--- /dev/null
+++ b/commons/pkg/configmanager/loader.go
@@ -0,0 +1,54 @@
+// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package configmanager
+
+import (
+    "fmt"
+
+    "github.com/BurntSushi/toml"
+)
+
+// LoadTOMLConfig loads configuration from a TOML file into the provided config struct.
+// The config parameter should be a pointer to a struct with TOML tags.
+// Type is inferred from the config parameter.
+//
+// Example usage:
+//
+//    type Config struct {
+//        ServerPort int      `toml:"serverPort"`
+//        LogLevel   string   `toml:"logLevel"`
+//        Features   []string `toml:"features"`
+//    }
+//
+//    var cfg Config
+//    err := configmanager.LoadTOMLConfig("/etc/config.toml", &cfg)
+//    if err != nil {
+//        log.Fatalf("Failed to load config: %v", err)
+//    }
+//
+//    // Validate and set defaults after loading
+//    if cfg.ServerPort == 0 {
+//        cfg.ServerPort = 8080
+//    }
+//    if cfg.ServerPort < 1 || cfg.ServerPort > 65535 {
+//        return fmt.Errorf("ServerPort must be between 1 and 65535")
+//    }
+func LoadTOMLConfig[T any](path string, config *T) error {
+    if _, err := toml.DecodeFile(path, config); err != nil {
+        return fmt.Errorf("failed to decode TOML file %s: %w", path, err)
+    }
+
+    return nil
+}
diff --git a/commons/pkg/configmanager/loader_test.go b/commons/pkg/configmanager/loader_test.go
new file mode 100644
index 000000000..ceeb40967
--- /dev/null
+++ b/commons/pkg/configmanager/loader_test.go
@@ -0,0 +1,95 @@
+// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package configmanager
+
+import (
+    "os"
+    "path/filepath"
+    "testing"
+)
+
+type testTOMLConfig struct {
+    Name    string `toml:"name"`
+    Port    int    `toml:"port"`
+    Enabled bool   `toml:"enabled"`
+}
+
+func TestLoadTOMLConfig(t *testing.T) {
+    t.Parallel()
+
+    tomlContent := `name = "test"
+port = 8080
+enabled = true
+`
+
+    tmpDir := t.TempDir()
+    configPath := filepath.Join(tmpDir, "config.toml")
+    if err := os.WriteFile(configPath, []byte(tomlContent), 0600); err != nil {
+        t.Fatalf("failed to write test config: %v", err)
+    }
+
+    var cfg testTOMLConfig
+
+    err := LoadTOMLConfig(configPath, &cfg)
+    if err != nil {
+        t.Fatalf("failed to load TOML config: %v", err)
+    }
+
+    if cfg.Name != "test" {
+        t.Errorf("expected name 'test', got '%s'", cfg.Name)
+    }
+
+    if cfg.Port != 8080 {
+        t.Errorf("expected port 8080, got %d", cfg.Port)
+    }
+
+    if !cfg.Enabled {
+        t.Error("expected enabled to be true")
+    }
+}
+
+func TestLoadTOMLConfigNonExistentFile(t *testing.T) {
+    t.Parallel()
+
+    var cfg testTOMLConfig
+
+    nonExistentPath := filepath.Join(t.TempDir(), "nonexistent.toml")
+    err := LoadTOMLConfig(nonExistentPath, &cfg)
+    if err == nil {
+        t.Fatal("expected error for non-existent file, got nil")
+    }
+}
+
+func TestLoadTOMLConfigInvalidSyntax(t *testing.T) {
+    t.Parallel()
+
+    invalidTOML := `name = "test"
+port = this is not valid toml
+enabled = true
+`
+
+    tmpDir := t.TempDir()
+    configPath := filepath.Join(tmpDir, "invalid_config.toml")
+    if err := os.WriteFile(configPath, []byte(invalidTOML), 0600); err != nil {
+        t.Fatalf("failed to write test config: %v", err)
+    }
+
+    var cfg testTOMLConfig
+
+    err := LoadTOMLConfig(configPath, &cfg)
+    if err == nil {
+        t.Fatal("expected error for invalid TOML syntax, got nil")
+    }
+}
diff --git a/fault-quarantine-module/Makefile b/fault-quarantine-module/Makefile
index 7cfd467a9..cf01cf21b 100644
--- a/fault-quarantine-module/Makefile
+++ 
b/fault-quarantine-module/Makefile @@ -26,6 +26,11 @@ IS_KO_MODULE := 1 # Fault-quarantine-module specific settings CLEAN_EXTRA_FILES := fault-quarantine-module +# Test setup commands for kubebuilder envtest +TEST_SETUP_COMMANDS := \ + go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest && \ + eval $$(setup-envtest use --use-env -p env) && + # ============================================================================= # INCLUDE SHARED DEFINITIONS # ============================================================================= diff --git a/fault-quarantine-module/go.mod b/fault-quarantine-module/go.mod index 3e39026c8..ffa242586 100644 --- a/fault-quarantine-module/go.mod +++ b/fault-quarantine-module/go.mod @@ -5,13 +5,13 @@ go 1.25 toolchain go1.25.3 require ( - github.com/BurntSushi/toml v1.5.0 github.com/google/cel-go v0.26.0 github.com/hashicorp/go-multierror v1.1.1 github.com/nvidia/nvsentinel/commons v0.0.0 github.com/nvidia/nvsentinel/data-models v0.0.0 github.com/nvidia/nvsentinel/store-client-sdk v0.0.0 github.com/prometheus/client_golang v1.23.2 + github.com/stretchr/testify v1.11.1 go.mongodb.org/mongo-driver v1.17.4 golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 golang.org/x/sync v0.17.0 @@ -19,15 +19,18 @@ require ( k8s.io/api v0.34.1 k8s.io/apimachinery v0.34.1 k8s.io/client-go v0.34.1 + sigs.k8s.io/controller-runtime v0.22.3 ) require ( cel.dev/expr v0.24.0 // indirect + github.com/BurntSushi/toml v1.5.0 // indirect github.com/antlr4-go/antlr/v4 v4.13.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.13.0 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-openapi/jsonpointer v0.22.1 // indirect @@ -48,6 +51,7 @@ require ( github.com/golang/snappy v1.0.0 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect + github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d // indirect github.com/google/uuid v1.6.0 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect @@ -56,6 +60,8 @@ require ( github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/montanaflynn/stats v0.7.1 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/onsi/ginkgo/v2 v2.26.0 // indirect + github.com/onsi/gomega v1.38.2 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.1 // indirect @@ -67,6 +73,8 @@ require ( github.com/xdg-go/scram v1.1.2 // indirect github.com/xdg-go/stringprep v1.0.4 // indirect github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.43.0 // indirect @@ -76,11 +84,14 @@ require ( golang.org/x/term v0.36.0 // indirect golang.org/x/text v0.30.0 // indirect golang.org/x/time v0.14.0 // indirect + golang.org/x/tools v0.38.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20250804133106-a7a43d27e69b // indirect 
google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f // indirect google.golang.org/grpc v1.76.0 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/apiextensions-apiserver v0.34.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect @@ -90,7 +101,6 @@ require ( sigs.k8s.io/yaml v1.6.0 // indirect ) -// Local replacements for internal modules replace github.com/nvidia/nvsentinel/store-client-sdk => ../store-client-sdk replace github.com/nvidia/nvsentinel/data-models => ../data-models diff --git a/fault-quarantine-module/go.sum b/fault-quarantine-module/go.sum index 249a945d8..b7b251daa 100644 --- a/fault-quarantine-module/go.sum +++ b/fault-quarantine-module/go.sum @@ -2,6 +2,8 @@ cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg= github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -14,12 +16,18 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= +github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= github.com/go-openapi/jsonpointer v0.22.1 h1:sHYI1He3b9NqJ4wXLoJDKmUmHkWy/L7rtEo92JUxBNk= github.com/go-openapi/jsonpointer v0.22.1/go.mod h1:pQT9OsLkfz1yWoMgYFy4x3U5GY5nUlsOn1qSBH5MkCM= github.com/go-openapi/jsonreference v0.21.2 h1:Wxjda4M/BBQllegefXrY/9aq1fxBA8sI5M/lFU6tSWU= @@ -65,8 +73,8 @@ github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7O github.com/google/go-cmp v0.7.0 
h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d h1:KJIErDwbSHjnp/SGzE5ed8Aol7JsKiI5X7yWKAtzhM0= +github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -96,10 +104,10 @@ github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8 github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= -github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4= -github.com/onsi/gomega v1.35.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/onsi/ginkgo/v2 v2.26.0 h1:1J4Wut1IlYZNEAWIV3ALrT9NfiaGW2cDCJQSFQMs/gE= +github.com/onsi/ginkgo/v2 v2.26.0/go.mod h1:qhEywmzWTBUY88kfO0BRvX4py7scov9yR+Az2oavUzw= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -147,16 +155,22 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= 
-go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= @@ -172,6 +186,8 @@ golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbR golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -214,12 +230,14 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE= -golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/genproto/googleapis/api 
v0.0.0-20250804133106-a7a43d27e69b h1:ULiyYQ0FdsJhwwZUwbaXpZF5yUE3h+RA+gxvBu37ucc= @@ -242,6 +260,8 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= +k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI= +k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc= k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= @@ -252,6 +272,8 @@ k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZ k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.22.3 h1:I7mfqz/a/WdmDCEnXmSPm8/b/yRTy6JsKKENTijTq8Y= +sigs.k8s.io/controller-runtime v0.22.3/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= diff --git a/fault-quarantine-module/main.go b/fault-quarantine-module/main.go index 2a38e9fb3..c56735244 100644 --- a/fault-quarantine-module/main.go +++ b/fault-quarantine-module/main.go @@ -21,20 +21,14 @@ import ( "log/slog" "os" "os/signal" - "path/filepath" "strconv" "syscall" "time" "github.com/nvidia/nvsentinel/commons/pkg/logger" "github.com/nvidia/nvsentinel/commons/pkg/server" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/reconciler" - "github.com/nvidia/nvsentinel/store-client-sdk/pkg/storewatcher" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/initializer" "golang.org/x/sync/errgroup" - - "go.mongodb.org/mongo-driver/bson" - "go.mongodb.org/mongo-driver/mongo" ) var ( @@ -54,216 +48,49 @@ func main() { } } -func parseFlags() (metricsPort, mongoClientCertMountPath, kubeconfigPath *string, dryRun, circuitBreakerEnabled *bool, - circuitBreakerPercentage *int, circuitBreakerDuration *time.Duration) { - metricsPort = flag.String("metrics-port", "2112", "port to expose Prometheus metrics on") - - mongoClientCertMountPath = flag.String("mongo-client-cert-mount-path", "/etc/ssl/mongo-client", - "path where the mongodb client cert is mounted") - - kubeconfigPath = flag.String("kubeconfig-path", "", "path to kubeconfig file") - - dryRun = flag.Bool("dry-run", false, "flag to run node drainer module in dry-run mode") - - circuitBreakerPercentage = flag.Int("circuit-breaker-percentage", - 50, "percentage of nodes to cordon before tripping the circuit breaker") - - circuitBreakerDuration = flag.Duration("circuit-breaker-duration", - 5*time.Minute, "duration of the circuit breaker window") - - circuitBreakerEnabled = flag.Bool("circuit-breaker-enabled", true, - "enable or disable fault quarantine circuit breaker") - - flag.Parse() - - return -} 
- -func loadEnvConfig() (namespace, mongoURI, mongoDatabase, mongoCollection, tokenDatabase, tokenCollection string, - err error) { - namespace = os.Getenv("POD_NAMESPACE") - if namespace == "" { - return "", "", "", "", "", "", fmt.Errorf("POD_NAMESPACE is not provided") - } - - mongoURI = os.Getenv("MONGODB_URI") - if mongoURI == "" { - return "", "", "", "", "", "", fmt.Errorf("MONGODB_URI is not provided") - } - - mongoDatabase = os.Getenv("MONGODB_DATABASE_NAME") - if mongoDatabase == "" { - return "", "", "", "", "", "", fmt.Errorf("MONGODB_DATABASE_NAME is not provided") - } - - mongoCollection = os.Getenv("MONGODB_COLLECTION_NAME") - if mongoCollection == "" { - return "", "", "", "", "", "", fmt.Errorf("MONGODB_COLLECTION_NAME is not provided") - } - - tokenDatabase = os.Getenv("MONGODB_DATABASE_NAME") - if tokenDatabase == "" { - return "", "", "", "", "", "", fmt.Errorf("MONGODB_DATABASE_NAME is not provided") - } - - tokenCollection = os.Getenv("MONGODB_TOKEN_COLLECTION_NAME") - if tokenCollection == "" { - return "", "", "", "", "", "", fmt.Errorf("MongoDB token collection name is not provided") - } - - return namespace, mongoURI, mongoDatabase, mongoCollection, tokenDatabase, tokenCollection, nil -} - -func loadMongoTimeouts() (totalTimeoutSeconds, intervalSeconds, totalCACertTimeoutSeconds, - intervalCACertSeconds, unprocessedEventsMetricUpdateIntervalSeconds int, err error) { - totalTimeoutSeconds, err = getEnvAsInt("MONGODB_PING_TIMEOUT_TOTAL_SECONDS", 300) - if err != nil { - return 0, 0, 0, 0, 0, fmt.Errorf("invalid MONGODB_PING_TIMEOUT_TOTAL_SECONDS: %w", err) - } - - intervalSeconds, err = getEnvAsInt("MONGODB_PING_INTERVAL_SECONDS", 5) - if err != nil { - return 0, 0, 0, 0, 0, fmt.Errorf("invalid MONGODB_PING_INTERVAL_SECONDS: %w", err) - } - - totalCACertTimeoutSeconds, err = getEnvAsInt("CA_CERT_MOUNT_TIMEOUT_TOTAL_SECONDS", 360) - if err != nil { - return 0, 0, 0, 0, 0, fmt.Errorf("invalid CA_CERT_MOUNT_TIMEOUT_TOTAL_SECONDS: %w", err) - } - - intervalCACertSeconds, err = getEnvAsInt("CA_CERT_READ_INTERVAL_SECONDS", 5) - if err != nil { - return 0, 0, 0, 0, 0, fmt.Errorf("invalid CA_CERT_READ_INTERVAL_SECONDS: %w", err) - } - - unprocessedEventsMetricUpdateIntervalSeconds, err = - getEnvAsInt("UNPROCESSED_EVENTS_METRIC_UPDATE_INTERVAL_SECONDS", 25) - if err != nil { - return 0, 0, 0, 0, 0, fmt.Errorf("invalid UNPROCESSED_EVENTS_METRIC_UPDATE_INTERVAL_SECONDS: %w", err) - } - - return -} - -func createMongoConfig( - mongoURI, mongoDatabase, mongoCollection, mongoClientCertMountPath string, - totalTimeoutSeconds, intervalSeconds, totalCACertTimeoutSeconds, intervalCACertSeconds int, -) storewatcher.MongoDBConfig { - return storewatcher.MongoDBConfig{ - URI: mongoURI, - Database: mongoDatabase, - Collection: mongoCollection, - ClientTLSCertConfig: storewatcher.MongoDBClientTLSCertConfig{ - TlsCertPath: filepath.Join(mongoClientCertMountPath, "tls.crt"), - TlsKeyPath: filepath.Join(mongoClientCertMountPath, "tls.key"), - CaCertPath: filepath.Join(mongoClientCertMountPath, "ca.crt"), - }, - TotalPingTimeoutSeconds: totalTimeoutSeconds, - TotalPingIntervalSeconds: intervalSeconds, - TotalCACertTimeoutSeconds: totalCACertTimeoutSeconds, - TotalCACertIntervalSeconds: intervalCACertSeconds, - } -} - -func createTokenConfig(tokenDatabase, tokenCollection string) storewatcher.TokenConfig { - return storewatcher.TokenConfig{ - ClientName: "fault-quarantine-module", - TokenDatabase: tokenDatabase, - TokenCollection: tokenCollection, - } -} - -func createPipeline() 
mongo.Pipeline { - return mongo.Pipeline{ - {{Key: "$match", Value: bson.D{{Key: "operationType", Value: bson.D{{Key: "$in", Value: bson.A{"insert"}}}}}}}, - } -} - func run() error { - // Create a context that gets cancelled on OS interrupt signals - ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) - defer stop() // Ensure the signal listener is cleaned up - metricsPort, mongoClientCertMountPath, kubeconfigPath, dryRun, circuitBreakerEnabled, - circuitBreakerPercentage, circuitBreakerDuration := parseFlags() + circuitBreakerPercentage, circuitBreakerDuration, tomlConfigPath := parseFlags() - namespace, mongoURI, mongoDatabase, mongoCollection, tokenDatabase, tokenCollection, err := loadEnvConfig() - if err != nil { - return err - } + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() - totalTimeoutSeconds, intervalSeconds, totalCACertTimeoutSeconds, - intervalCACertSeconds, unprocessedEventsMetricUpdateIntervalSeconds, err := loadMongoTimeouts() + portInt, err := strconv.Atoi(*metricsPort) if err != nil { - return err + return fmt.Errorf("invalid metrics port: %w", err) } - mongoConfig := createMongoConfig(mongoURI, mongoDatabase, mongoCollection, *mongoClientCertMountPath, - totalTimeoutSeconds, intervalSeconds, totalCACertTimeoutSeconds, intervalCACertSeconds) - - tokenConfig := createTokenConfig(tokenDatabase, tokenCollection) - - pipeline := createPipeline() - - tomlCfg, err := config.LoadTomlConfig("/etc/config/config.toml") - if err != nil { - return fmt.Errorf("error loading TOML config: %w", err) - } + srv := server.NewServer( + server.WithPort(portInt), + server.WithPrometheusMetrics(), + server.WithSimpleHealth(), + ) - if *dryRun { - slog.Info("Running in dry-run mode") + params := initializer.InitializationParams{ + MongoClientCertMountPath: *mongoClientCertMountPath, + KubeconfigPath: *kubeconfigPath, + TomlConfigPath: *tomlConfigPath, + DryRun: *dryRun, + CircuitBreakerPercentage: *circuitBreakerPercentage, + CircuitBreakerDuration: *circuitBreakerDuration, + CircuitBreakerEnabled: *circuitBreakerEnabled, } - // Initialize the k8s client - k8sClient, err := reconciler.NewFaultQuarantineClient(*kubeconfigPath, *dryRun) + components, err := initializer.InitializeAll(ctx, params) if err != nil { - return fmt.Errorf("error while initializing kubernetes client: %w", err) - } - - slog.Info("Successfully initialized k8sclient") - - reconcilerCfg := reconciler.ReconcilerConfig{ - TomlConfig: *tomlCfg, - MongoHealthEventCollectionConfig: mongoConfig, - TokenConfig: tokenConfig, - MongoPipeline: pipeline, - K8sClient: k8sClient, - DryRun: *dryRun, - CircuitBreakerEnabled: *circuitBreakerEnabled, - UnprocessedEventsMetricUpdateInterval: time.Duration(unprocessedEventsMetricUpdateIntervalSeconds) * - time.Second, - CircuitBreaker: reconciler.CircuitBreakerConfig{ - Namespace: namespace, - Name: "fault-quarantine-circuit-breaker", - Percentage: *circuitBreakerPercentage, - Duration: *circuitBreakerDuration, - }, + return fmt.Errorf("initialization failed: %w", err) } - // Create the work signal channel (buffered channel acting as semaphore) - workSignal := make(chan struct{}, 1) // Buffer size 1 is usually sufficient + slog.Info("Starting node informer") - // Pass the workSignal channel to the Reconciler - rec := reconciler.NewReconciler(ctx, reconcilerCfg, workSignal) - - // Parse the port - portInt, err := strconv.Atoi(*metricsPort) - if err != nil { - return fmt.Errorf("invalid metrics port: 
%w", err) + if err := components.K8sClient.NodeInformer.Run(ctx.Done()); err != nil { + return fmt.Errorf("failed to start node informer: %w", err) } - // Create the server - srv := server.NewServer( - server.WithPort(portInt), - server.WithPrometheusMetrics(), - server.WithSimpleHealth(), - ) + slog.Info("Node informer started and synced") - // Start server and reconciler concurrently g, gCtx := errgroup.WithContext(ctx) - // Start the metrics/health server. - // Metrics server failures are logged but do NOT terminate the service. g.Go(func() error { slog.Info("Starting metrics server", "port", portInt) @@ -275,27 +102,36 @@ func run() error { }) g.Go(func() error { - return rec.Start(gCtx) + return components.Reconciler.Start(gCtx) }) - // Wait for both goroutines to finish return g.Wait() } -func getEnvAsInt(name string, defaultValue int) (int, error) { - valueStr, exists := os.LookupEnv(name) - if !exists { - return defaultValue, nil - } +func parseFlags() (metricsPort, mongoClientCertMountPath, kubeconfigPath *string, dryRun, circuitBreakerEnabled *bool, + circuitBreakerPercentage *int, circuitBreakerDuration *time.Duration, tomlConfigPath *string) { + metricsPort = flag.String("metrics-port", "2112", "port to expose Prometheus metrics on") - value, err := strconv.Atoi(valueStr) - if err != nil { - return 0, fmt.Errorf("error converting %s to integer: %w", name, err) - } + mongoClientCertMountPath = flag.String("mongo-client-cert-mount-path", "/etc/ssl/mongo-client", + "path where the mongodb client cert is mounted") - if value <= 0 { - return 0, fmt.Errorf("value of %s must be a positive integer", name) - } + kubeconfigPath = flag.String("kubeconfig-path", "", "path to kubeconfig file") - return value, nil + tomlConfigPath = flag.String("config-path", "/etc/config/config.toml", + "path where the fault quarantine config file is present") + + dryRun = flag.Bool("dry-run", false, "flag to run fault quarantine module in dry-run mode") + + circuitBreakerPercentage = flag.Int("circuit-breaker-percentage", + 50, "percentage of nodes to cordon before tripping the circuit breaker") + + circuitBreakerDuration = flag.Duration("circuit-breaker-duration", + 5*time.Minute, "duration of the circuit breaker window") + + circuitBreakerEnabled = flag.Bool("circuit-breaker-enabled", true, + "enable or disable fault quarantine circuit breaker") + + flag.Parse() + + return } diff --git a/fault-quarantine-module/pkg/breaker/breaker.go b/fault-quarantine-module/pkg/breaker/breaker.go index 80595591a..f6b1bfac5 100644 --- a/fault-quarantine-module/pkg/breaker/breaker.go +++ b/fault-quarantine-module/pkg/breaker/breaker.go @@ -22,11 +22,13 @@ package breaker import ( "context" + "errors" "fmt" "log/slog" "math" "time" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/metrics" "golang.org/x/exp/maps" ) @@ -34,6 +36,12 @@ const ( resultError = "error" ) +var ( + // ErrRetryExhausted signals that GetTotalNodes retry attempts were exhausted + // This error should trigger pod restart + ErrRetryExhausted = errors.New("circuit breaker: all retry attempts exhausted") +) + // NewSlidingWindowBreaker creates a new sliding window circuit breaker for fault quarantine. // It prevents cordoning more than a specified percentage of nodes within a time window. // The breaker uses a ring buffer with 1-second granularity to track unique cordoned nodes. 
@@ -54,14 +62,17 @@ func NewSlidingWindowBreaker(ctx context.Context, cfg Config) (CircuitBreaker, e b.indexToNodes[i] = make(map[string]bool) } - err := cfg.EnsureConfigMap(ctx, StateClosed) + err := cfg.K8sClient.EnsureCircuitBreakerConfigMap(ctx, cfg.ConfigMapName, cfg.ConfigMapNamespace, StateClosed) if err != nil { slog.Error("Error ensuring circuit breaker config map", "error", err) return nil, fmt.Errorf("error ensuring circuit breaker config map: %w", err) } - if s, err := cfg.ReadStateFn(ctx); err == nil && (s == StateClosed || s == StateTripped) { - b.state = s + state, err := cfg.K8sClient.ReadCircuitBreakerState(ctx, cfg.ConfigMapName, cfg.ConfigMapNamespace) + if err == nil { + if state == StateClosed || state == StateTripped { + b.state = state + } } return b, nil @@ -197,14 +208,14 @@ func (b *slidingWindowBreaker) IsTripped(ctx context.Context) (bool, error) { recentCordonedNodes := b.sumBuckets() threshold := int(math.Ceil(float64(totalNodes) * b.cfg.TripPercentage / 100)) shouldTrip := recentCordonedNodes >= threshold + b.mu.Unlock() slog.Debug("Recent cordoned nodes status", "recentCordonedNodes", recentCordonedNodes, "totalNodes", totalNodes, "tripPercentage", b.cfg.TripPercentage) - SetFaultQuarantineBreakerUtilization(float64(recentCordonedNodes) / float64(totalNodes)) - b.mu.Unlock() + metrics.SetFaultQuarantineBreakerUtilization(float64(recentCordonedNodes) / float64(totalNodes)) if shouldTrip { err := b.ForceState(ctx, StateTripped) @@ -213,12 +224,12 @@ func (b *slidingWindowBreaker) IsTripped(ctx context.Context) (bool, error) { return true, fmt.Errorf("error forcing circuit breaker state to TRIPPED: %w", err) } - SetFaultQuarantineBreakerState(StateTripped) + metrics.SetFaultQuarantineBreakerState(string(StateTripped)) return true, nil } - SetFaultQuarantineBreakerState(StateClosed) + metrics.SetFaultQuarantineBreakerState(string(StateClosed)) return false, nil } @@ -231,9 +242,11 @@ func (b *slidingWindowBreaker) ForceState(ctx context.Context, s State) error { b.state = s b.mu.Unlock() - if err := b.cfg.WriteStateFn(ctx, s); err != nil { + err := b.cfg.K8sClient.WriteCircuitBreakerState( + ctx, b.cfg.ConfigMapName, b.cfg.ConfigMapNamespace, s) + if err != nil { slog.Error("Error writing circuit breaker state", "error", err) - return fmt.Errorf("failed to write circuit breaker state %s: %w", s, err) + return fmt.Errorf("error writing circuit breaker state: %w", err) } slog.Info("ForceState changed", "state", s) @@ -261,19 +274,17 @@ func (b *slidingWindowBreaker) getTotalNodesWithRetry(ctx context.Context) (int, defer func() { duration := time.Since(startTime).Seconds() - faultQuarantineGetTotalNodesDuration.WithLabelValues(result).Observe(duration) + metrics.FaultQuarantineGetTotalNodesDuration.WithLabelValues(result).Observe(duration) if errorType != "" { - faultQuarantineGetTotalNodesErrors.WithLabelValues(errorType).Inc() + metrics.FaultQuarantineGetTotalNodesErrors.WithLabelValues(errorType).Inc() } }() maxRetries, initialDelay, maxDelay := b.getRetryConfig() - var lastErr error - for attempt := 0; attempt <= maxRetries; attempt++ { - totalNodes, err := b.cfg.GetTotalNodes(ctx) + totalNodes, err := b.cfg.K8sClient.GetTotalNodes(ctx) if err != nil { result = resultError @@ -285,14 +296,15 @@ func (b *slidingWindowBreaker) getTotalNodesWithRetry(ctx context.Context) (int, if totalNodes > 0 { result = "success" - faultQuarantineGetTotalNodesRetryAttempts.Observe(float64(attempt)) + metrics.FaultQuarantineGetTotalNodesRetryAttempts.Observe(float64(attempt)) 
return b.handleSuccessfulNodeCount(totalNodes, attempt) } - // Store error for final return (only last value is used) - //nolint:staticcheck // SA4006: intermediate values overwritten, only final used - lastErr = b.handleZeroNodes(attempt, maxRetries) + if attempt == 0 { + slog.Info("Circuit breaker starting retries: NodeInformer cache may not be synced yet", + "maxRetries", maxRetries) + } if attempt < maxRetries { if err := b.performRetryDelay(ctx, attempt, maxRetries, initialDelay, maxDelay); err != nil { @@ -305,14 +317,10 @@ func (b *slidingWindowBreaker) getTotalNodesWithRetry(ctx context.Context) (int, } // All retries exhausted - if err := b.logRetriesExhausted(maxRetries, initialDelay, maxDelay); err != nil { - return 0, fmt.Errorf("error logging retries exhausted: %w", err) - } - result = resultError errorType = "zero_nodes" - return 0, lastErr + return 0, b.logRetriesExhausted(ctx, maxRetries, initialDelay, maxDelay) } // getRetryConfig extracts and validates retry configuration with defaults @@ -356,18 +364,6 @@ func (b *slidingWindowBreaker) handleSuccessfulNodeCount(totalNodes, attempt int return totalNodes, nil } -// handleZeroNodes handles the case when GetTotalNodes returns 0 -func (b *slidingWindowBreaker) handleZeroNodes(attempt, maxRetries int) error { - lastErr := fmt.Errorf("GetTotalNodes returned 0 nodes (likely NodeInformer cache not synced yet)") - - if attempt == 0 { - slog.Info("Circuit breaker starting retries: NodeInformer cache may not be synced yet", - "maxRetries", maxRetries) - } - - return lastErr -} - // performRetryDelay calculates and performs the exponential backoff delay func (b *slidingWindowBreaker) performRetryDelay(ctx context.Context, attempt, maxRetries int, initialDelay, maxDelay time.Duration) error { @@ -406,13 +402,11 @@ func (b *slidingWindowBreaker) calculateBackoffDelay(attempt int, return delay } -// logRetriesExhausted logs a summary when all retries are exhausted and crashes the pod. -// It attempts to get the actual node count for accurate error context. -// Returns an error if unable to get the node count. -func (b *slidingWindowBreaker) logRetriesExhausted(maxRetries int, initialDelay, maxDelay time.Duration) error { - // Get the actual node count from the last attempt to provide accurate error context - ctx := context.Background() - actualNodes, err := b.cfg.GetTotalNodes(ctx) +// logRetriesExhausted logs a summary when all retries are exhausted. +// Returns ErrRetryExhausted wrapped with context for pod restart. +func (b *slidingWindowBreaker) logRetriesExhausted(ctx context.Context, maxRetries int, + initialDelay, maxDelay time.Duration) error { + actualNodes, err := b.cfg.K8sClient.GetTotalNodes(ctx) if err != nil { slog.Error( @@ -423,8 +417,17 @@ func (b *slidingWindowBreaker) logRetriesExhausted(maxRetries int, initialDelay, "totalClusterNodes", actualNodes, "maxDelay", maxDelay) - return nil + return fmt.Errorf("%w: failed to get node count: %w", ErrRetryExhausted, err) } - return fmt.Errorf("error getting total nodes after %d retries", maxRetries) + slog.Error("Circuit breaker: All retry attempts exhausted", + "maxRetries", maxRetries, + "actualNodes", actualNodes, + "initialDelay", initialDelay, + "maxDelay", maxDelay, + "message", + "Found total nodes but GetTotalNodes still returning 0. NodeInformer cache sync issues. 
Pod will restart.") + + return fmt.Errorf("%w: NodeInformer cache sync failed after %d retries (actualNodes=%d but GetTotalNodes returning 0)", + ErrRetryExhausted, maxRetries, actualNodes) } diff --git a/fault-quarantine-module/pkg/breaker/breaker_test.go b/fault-quarantine-module/pkg/breaker/breaker_test.go index b78f2dbbf..ed7ea76e6 100644 --- a/fault-quarantine-module/pkg/breaker/breaker_test.go +++ b/fault-quarantine-module/pkg/breaker/breaker_test.go @@ -17,175 +17,364 @@ package breaker import ( "context" "fmt" + "log" + "os" "testing" "time" + + "github.com/stretchr/testify/require" + "go.mongodb.org/mongo-driver/bson/primitive" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/envtest" +) + +var ( + testClient *kubernetes.Clientset + testEnv *envtest.Environment ) -func newTestBreaker(t *testing.T, totalNodes int, tripPercentage float64, window time.Duration, opts ...func(*Config)) CircuitBreaker { +func TestMain(m *testing.M) { + var err error + + testEnv = &envtest.Environment{} + + testRestConfig, err := testEnv.Start() + if err != nil { + log.Fatalf("Failed to start test environment: %v", err) + } + + testClient, err = kubernetes.NewForConfig(testRestConfig) + if err != nil { + log.Fatalf("Failed to create kubernetes client: %v", err) + } + + exitCode := m.Run() + + if err := testEnv.Stop(); err != nil { + log.Fatalf("Failed to stop test environment: %v", err) + } + os.Exit(exitCode) +} + +type testK8sClient struct { + clientset kubernetes.Interface + informer cache.SharedIndexInformer + informerSynced cache.InformerSynced +} + +func (c *testK8sClient) GetTotalNodes(ctx context.Context) (int, error) { + if !c.informerSynced() { + return 0, fmt.Errorf("node informer cache not synced yet") + } + + allObjs := c.informer.GetIndexer().List() + return len(allObjs), nil +} + +func (c *testK8sClient) EnsureCircuitBreakerConfigMap(ctx context.Context, name, namespace string, initialStatus State) error { + cmClient := c.clientset.CoreV1().ConfigMaps(namespace) + + _, err := cmClient.Get(ctx, name, metav1.GetOptions{}) + if err == nil { + return nil + } + + if !errors.IsNotFound(err) { + return fmt.Errorf("failed to get config map %s in namespace %s: %w", name, namespace, err) + } + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, + Data: map[string]string{"status": string(initialStatus)}, + } + + _, err = cmClient.Create(ctx, cm, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create config map %s in namespace %s: %w", name, namespace, err) + } + + return nil +} + +func (c *testK8sClient) ReadCircuitBreakerState(ctx context.Context, name, namespace string) (State, error) { + cm, err := c.clientset.CoreV1().ConfigMaps(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return "", fmt.Errorf("failed to get config map %s in namespace %s: %w", name, namespace, err) + } + + if cm.Data == nil { + return "", nil + } + + return State(cm.Data["status"]), nil +} + +func (c *testK8sClient) WriteCircuitBreakerState(ctx context.Context, name, namespace string, state State) error { + cm, err := c.clientset.CoreV1().ConfigMaps(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return err + } + + if cm.Data == nil { + cm.Data = map[string]string{} + } + + cm.Data["status"] = string(state) + + _, err = 
c.clientset.CoreV1().ConfigMaps(namespace).Update(ctx, cm, metav1.UpdateOptions{}) + return err +} + +func createTestNode(ctx context.Context, t *testing.T, name string) { t.Helper() - ctx := context.Background() - callsEnsure := 0 - callsWrite := 0 - cfg := Config{ - Window: window, - TripPercentage: tripPercentage, - GetTotalNodes: func(context.Context) (int, error) { - return totalNodes, nil + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, }, - EnsureConfigMap: func(context.Context, State) error { - callsEnsure++ - return nil + Spec: corev1.NodeSpec{}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionTrue}, + }, }, - ReadStateFn: func(context.Context) (State, error) { return StateClosed, nil }, - WriteStateFn: func(context.Context, State) error { callsWrite++; return nil }, } - for _, opt := range opts { - opt(&cfg) + + _, err := testClient.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create test node %s: %v", name, err) } +} + +func setupTestClient(t *testing.T) *testK8sClient { + t.Helper() + + informerFactory := informers.NewSharedInformerFactory(testClient, 0) + nodeInformerObj := informerFactory.Core().V1().Nodes() + + client := &testK8sClient{ + clientset: testClient, + informer: nodeInformerObj.Informer(), + informerSynced: nodeInformerObj.Informer().HasSynced, + } + + stopCh := make(chan struct{}) + t.Cleanup(func() { close(stopCh) }) + + go client.informer.Run(stopCh) + + if ok := cache.WaitForCacheSync(stopCh, client.informerSynced); !ok { + t.Fatalf("Failed to sync node informer cache") + } + + return client +} + +func newTestBreaker(t *testing.T, ctx context.Context, totalNodes int, tripPercentage float64, window time.Duration, initialState State) CircuitBreaker { + t.Helper() + + k8sClient := setupTestClient(t) + + // Create the specified number of nodes + nodeNames := make([]string, totalNodes) + for i := 0; i < totalNodes; i++ { + nodeName := fmt.Sprintf("test-node-%d-%s", i, primitive.NewObjectID().Hex()[:6]) + nodeNames[i] = nodeName + createTestNode(ctx, t, nodeName) + } + + t.Cleanup(func() { + for _, nodeName := range nodeNames { + _ = testClient.CoreV1().Nodes().Delete(context.Background(), nodeName, metav1.DeleteOptions{}) + } + }) + + // Wait for nodes to be visible in informer cache + require.Eventually(t, func() bool { + actualNodes, err := k8sClient.GetTotalNodes(ctx) + return err == nil && actualNodes == totalNodes + }, 5*time.Second, 50*time.Millisecond, "NodeInformer should see all %d nodes", totalNodes) + + configMapName := "test-breaker-" + primitive.NewObjectID().Hex()[:8] + + // If initialState is provided, create ConfigMap with that state + if initialState != "" { + err := k8sClient.EnsureCircuitBreakerConfigMap(ctx, configMapName, "default", initialState) + if err != nil { + t.Fatalf("failed to ensure circuit breaker ConfigMap: %v", err) + } + } + + cfg := Config{ + Window: window, + TripPercentage: tripPercentage, + K8sClient: k8sClient, + ConfigMapName: configMapName, + ConfigMapNamespace: "default", + } + b, err := NewSlidingWindowBreaker(ctx, cfg) if err != nil { t.Fatalf("failed to create breaker: %v", err) } - // Ensure constructor hook was called once - if callsEnsure != 1 { - t.Fatalf("expected EnsureConfigMap to be called once, got %d", callsEnsure) - } - // Use callsWrite to avoid unused warning via ForceState in tests - _ = callsWrite + + t.Cleanup(func() { + _ = 
testClient.CoreV1().ConfigMaps("default").Delete(context.Background(), configMapName, metav1.DeleteOptions{}) + }) + return b } func TestDoesNotTripBelowThreshold(t *testing.T) { - b := newTestBreaker(t, 10, 50, 1*time.Second) ctx := context.Background() + b := newTestBreaker(t, ctx, 10, 50, 1*time.Second, "") - // threshold = int(10*0.5) = 5, must exceed to trip - for i := 0; i < 5; i++ { + t.Log("Adding 4 cordon events (below threshold of 5)") + for i := 0; i < 4; i++ { b.AddCordonEvent(fmt.Sprintf("node%d", i)) } - if _, err := b.IsTripped(ctx); err != nil { + tripped, err := b.IsTripped(ctx) + if err != nil { t.Fatalf("error checking if breaker should trip: %v", err) } + if tripped { + t.Fatalf("breaker should not trip below threshold (4 < 5)") + } } func TestTripsWhenAboveThreshold(t *testing.T) { - b := newTestBreaker(t, 10, 50, 1*time.Second) ctx := context.Background() + b := newTestBreaker(t, ctx, 10, 50, 1*time.Second, "") - for i := 0; i < 6; i++ { // exceed threshold 5 + t.Log("Adding 5 cordon events (at threshold, should trip)") + for i := 0; i < 5; i++ { b.AddCordonEvent(fmt.Sprintf("node%d", i)) } - if tripped, err := b.IsTripped(ctx); err != nil { + tripped, err := b.IsTripped(ctx) + if err != nil { t.Fatalf("error checking if breaker should trip: %v", err) - } else if !tripped { - t.Fatalf("breaker should trip when above threshold") + } + if !tripped { + t.Fatalf("breaker should trip at threshold (5 >= 5)") } } func TestForceStateOverridesComputation(t *testing.T) { - b := newTestBreaker(t, 10, 50, 1*time.Second) ctx := context.Background() + b := newTestBreaker(t, ctx, 10, 50, 1*time.Second, "") - if err := b.ForceState(ctx, StateTripped); err != nil { + t.Log("Force state to TRIPPED") + err := b.ForceState(ctx, StateTripped) + if err != nil { t.Fatalf("force trip failed: %v", err) } - if tripped, err := b.IsTripped(ctx); err != nil { + tripped, err := b.IsTripped(ctx) + if err != nil { t.Fatalf("error checking if breaker should trip: %v", err) - } else if !tripped { + } + if !tripped { t.Fatalf("breaker should report tripped after ForceState(StateTripped)") } - if err := b.ForceState(ctx, StateClosed); err != nil { + t.Log("Force state to CLOSED") + err = b.ForceState(ctx, StateClosed) + if err != nil { t.Fatalf("force close failed: %v", err) } - if tripped, err := b.IsTripped(ctx); err != nil { + tripped, err = b.IsTripped(ctx) + if err != nil { t.Fatalf("error checking if breaker should trip: %v", err) - } else if tripped { + } + if tripped { t.Fatalf("breaker should not be tripped after ForceState(StateClosed)") } } func TestWindowExpiryResetsCounts(t *testing.T) { - b := newTestBreaker(t, 10, 50, 1*time.Second) ctx := context.Background() + b := newTestBreaker(t, ctx, 10, 50, 1*time.Second, "") - for i := 0; i < 6; i++ { // exceed threshold + t.Log("Adding 6 cordon events (exceeds threshold)") + for i := 0; i < 6; i++ { b.AddCordonEvent(fmt.Sprintf("node%d", i)) } - if tripped, err := b.IsTripped(ctx); err != nil { + tripped, err := b.IsTripped(ctx) + if err != nil { t.Fatalf("error checking if breaker should trip: %v", err) - } else if !tripped { + } + if !tripped { t.Fatalf("breaker should trip when above threshold") } - // Close it again so we can test reset via window advance - if err := b.ForceState(ctx, StateClosed); err != nil { + t.Log("Force close to test window advance") + err = b.ForceState(ctx, StateClosed) + if err != nil { t.Fatalf("force close failed: %v", err) } - // Wait for window to roll over; buckets are 1s granularity + t.Log("Wait for window to 
roll over (1s granularity)") time.Sleep(1100 * time.Millisecond) - if tripped, err := b.IsTripped(ctx); err != nil { + tripped, err = b.IsTripped(ctx) + if err != nil { t.Fatalf("error checking if breaker should trip: %v", err) - } else if tripped { + } + if tripped { t.Fatalf("breaker should not trip after window expiry with no new events") } } func TestInitializeFromReadState(t *testing.T) { ctx := context.Background() - // Start with TRIPPED in persisted state - b, err := NewSlidingWindowBreaker(ctx, Config{ - Window: 1 * time.Second, - TripPercentage: 50, - GetTotalNodes: func(context.Context) (int, error) { - return 10, nil - }, - EnsureConfigMap: func(context.Context, State) error { return nil }, - ReadStateFn: func(context.Context) (State, error) { return StateTripped, nil }, - WriteStateFn: func(context.Context, State) error { return nil }, - }) + b := newTestBreaker(t, ctx, 10, 50, 1*time.Second, StateTripped) + + t.Log("Verify breaker initialized with TRIPPED state") + tripped, err := b.IsTripped(ctx) if err != nil { - t.Fatalf("failed to create breaker: %v", err) - } - if tripped, err := b.IsTripped(ctx); err != nil { t.Fatalf("error checking if breaker should trip: %v", err) - } else if !tripped { - t.Fatalf("breaker should trip when above threshold") + } + if !tripped { + t.Fatalf("breaker should be tripped when initialized with TRIPPED state") } if got := b.CurrentState(); got != StateTripped { t.Fatalf("expected initial state TRIPPED, got %s", got) - } else if got != StateTripped { - t.Fatalf("expected initial state TRIPPED, got %s", got) } } func TestFlappingNodeDoesNotMultiplyCount(t *testing.T) { - b := newTestBreaker(t, 10, 50, 5*time.Second) // 5-second window, 50% threshold = 5 nodes ctx := context.Background() + b := newTestBreaker(t, ctx, 10, 50, 5*time.Second, "") - // Add the same node multiple times (simulating flapping) - for i := 0; i < 10; i++ { + t.Log("Add the same node 10 times (simulating flapping)") + for range 10 { b.AddCordonEvent("flapping-node") } - // Should not trip because it's only 1 unique node, not 6 - if tripped, err := b.IsTripped(ctx); err != nil { + t.Log("Verify breaker does not trip for single flapping node") + tripped, err := b.IsTripped(ctx) + if err != nil { t.Fatalf("error checking if breaker should trip: %v", err) - } else if tripped { - t.Fatalf("breaker should not trip for single flapping node") + } + if tripped { + t.Fatalf("breaker should not trip for single flapping node (1 < 5)") } - // Add 5 more unique nodes (total 6 unique nodes) - for i := 0; i < 5; i++ { + t.Log("Add 4 more unique nodes (total 5 unique nodes)") + for i := 0; i < 4; i++ { b.AddCordonEvent(fmt.Sprintf("node%d", i)) } - // Now should trip because we have 6 unique nodes - if tripped, err := b.IsTripped(ctx); err != nil { + // Now should trip because we have 5 unique nodes (at threshold, >= 5) + tripped, err = b.IsTripped(ctx) + if err != nil { t.Fatalf("error checking if breaker should trip: %v", err) - } else if !tripped { - t.Fatalf("breaker should trip with 6 unique nodes (exceeds 5 threshold)") + } + if !tripped { + t.Fatalf("breaker should trip with 5 unique nodes (5 >= 5 threshold)") } } diff --git a/fault-quarantine-module/pkg/breaker/metrics.go b/fault-quarantine-module/pkg/breaker/metrics.go deleted file mode 100644 index 0be2afc5d..000000000 --- a/fault-quarantine-module/pkg/breaker/metrics.go +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package breaker - -import ( - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" -) - -var ( - faultQuarantineBreakerState = promauto.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "fault_quarantine_breaker_state", - Help: "State of the fault quarantine breaker.", - }, - []string{"state"}, - ) - faultQuarantineBreakerUtilization = promauto.NewGauge( - prometheus.GaugeOpts{ - Name: "fault_quarantine_breaker_utilization", - Help: "Utilization of the fault quarantine breaker.", - }, - ) - faultQuarantineGetTotalNodesDuration = promauto.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "fault_quarantine_get_total_nodes_duration_seconds", - Help: "Duration of getTotalNodesWithRetry calls in seconds.", - Buckets: prometheus.DefBuckets, - }, - []string{"result"}, - ) - faultQuarantineGetTotalNodesErrors = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "fault_quarantine_get_total_nodes_errors_total", - Help: "Total number of errors from getTotalNodesWithRetry.", - }, - []string{"error_type"}, - ) - faultQuarantineGetTotalNodesRetryAttempts = promauto.NewHistogram( - prometheus.HistogramOpts{ - Name: "fault_quarantine_get_total_nodes_retry_attempts", - Help: "Number of retry attempts needed for getTotalNodesWithRetry.", - Buckets: []float64{0, 1, 2, 3, 5, 10}, - }, - ) -) - -func SetFaultQuarantineBreakerUtilization(utilization float64) { - faultQuarantineBreakerUtilization.Set(utilization) -} - -func SetFaultQuarantineBreakerState(state State) { - faultQuarantineBreakerState.Reset() - faultQuarantineBreakerState.WithLabelValues(string(state)).Set(1) -} diff --git a/fault-quarantine-module/pkg/breaker/types.go b/fault-quarantine-module/pkg/breaker/types.go index abc7a4d30..daf28154f 100644 --- a/fault-quarantine-module/pkg/breaker/types.go +++ b/fault-quarantine-module/pkg/breaker/types.go @@ -21,6 +21,22 @@ import ( "time" ) +// K8sClientOperations defines the minimal interface needed by the circuit breaker +type K8sClientOperations interface { + GetTotalNodes(ctx context.Context) (int, error) + EnsureCircuitBreakerConfigMap(ctx context.Context, name, namespace string, initialStatus State) error + ReadCircuitBreakerState(ctx context.Context, name, namespace string) (State, error) + WriteCircuitBreakerState(ctx context.Context, name, namespace string, status State) error +} + +// CircuitBreakerConfig holds the Kubernetes-specific configuration for the circuit breaker +type CircuitBreakerConfig struct { + Namespace string + Name string + Percentage int + Duration time.Duration +} + // State represents the current state of the circuit breaker type State string @@ -43,7 +59,7 @@ type CircuitBreaker interface { } // Config holds the configuration parameters for the sliding window circuit breaker. -// It defines the time window, trip threshold, and optional persistence hooks. +// It defines the time window, trip threshold, and K8s client for state persistence. 
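+// State is persisted in the ConfigMap identified by ConfigMapName and ConfigMapNamespace,
+// so a restarted pod resumes from the last recorded breaker state.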
type Config struct { // Window defines the sliding time window over which cordon events are counted. // Default: 5 minutes. Events older than this window are automatically discarded. @@ -54,10 +70,14 @@ type Config struct { // Default: 50 (50% of nodes). TripPercentage float64 - // GetTotalNodes returns the current total number of nodes in the cluster. - // Used to compute the dynamic trip threshold as total_nodes * TripPercentage. - // This allows the threshold to adapt to cluster scaling events. - GetTotalNodes func(ctx context.Context) (int, error) + // K8sClient provides operations for node counts and ConfigMap state persistence + K8sClient K8sClientOperations + + // ConfigMapName is the name of the ConfigMap used for state persistence + ConfigMapName string + + // ConfigMapNamespace is the namespace of the ConfigMap + ConfigMapNamespace string // MaxRetries is the maximum number of retry attempts when GetTotalNodes returns 0 // Default: 10 retries (allows ~30 seconds for cache sync with exponential backoff) @@ -70,13 +90,6 @@ type Config struct { // MaxRetryDelay caps the maximum delay between retry attempts // Default: 5 seconds (prevents excessive delays) MaxRetryDelay time.Duration - - // EnsureConfigMap creates/initializes the ConfigMap for state persistence - EnsureConfigMap func(ctx context.Context, initial State) error - // ReadStateFn reads the persisted breaker state from ConfigMap - ReadStateFn func(ctx context.Context) (State, error) - // WriteStateFn persists the current breaker state to ConfigMap - WriteStateFn func(ctx context.Context, s State) error } // slidingWindowBreaker implements CircuitBreaker using a ring buffer approach. diff --git a/fault-quarantine-module/pkg/common/common.go b/fault-quarantine-module/pkg/common/common.go index 33a65941c..a89ba7289 100644 --- a/fault-quarantine-module/pkg/common/common.go +++ b/fault-quarantine-module/pkg/common/common.go @@ -20,9 +20,6 @@ type RuleEvaluationResult int const ( RuleEvaluationSuccess RuleEvaluationResult = iota RuleEvaluationFailed - RuleEvaluationErroredOut - RuleEvaluationNotApplicable - RuleEvaluationRetryAgainInFuture ) const ( diff --git a/fault-quarantine-module/pkg/common/health_events_buffer.go b/fault-quarantine-module/pkg/common/health_events_buffer.go deleted file mode 100644 index e93154c61..000000000 --- a/fault-quarantine-module/pkg/common/health_events_buffer.go +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package common - -import ( - "context" - "fmt" - "log/slog" - - "github.com/nvidia/nvsentinel/data-models/pkg/model" - - "go.mongodb.org/mongo-driver/bson" -) - -// healthEventInfo represents information about a health event -type HealthEventInfo struct { - HealthEventWithStatus *model.HealthEventWithStatus - EventBson bson.M - HasProcessed bool -} - -// HealthEventBuffer represents a buffer of health events using an array -type HealthEventBuffer struct { - events []HealthEventInfo - ctx context.Context -} - -// NewHealthEventBuffer creates a new health event buffer. -func NewHealthEventBuffer(ctx context.Context) *HealthEventBuffer { - return &HealthEventBuffer{ - events: make([]HealthEventInfo, 0), - ctx: ctx, - } -} - -// Add adds a health event at the end of the buffer. -func (b *HealthEventBuffer) Add(event *model.HealthEventWithStatus, eventBson bson.M) { - b.events = append(b.events, HealthEventInfo{ - HealthEventWithStatus: event, - EventBson: eventBson, - HasProcessed: false, - }) -} - -// RemoveAt removes an element at the specified index. -func (b *HealthEventBuffer) RemoveAt(index int) error { - if index < 0 || index >= len(b.events) { - return fmt.Errorf("index out of bounds: %d", index) - } - - slog.Debug("Removing event at index", - "index", index, - "event", b.events[index].HealthEventWithStatus, - ) - - // Remove the element at index - b.events = append(b.events[:index], b.events[index+1:]...) - - return nil -} - -// Length returns the current number of elements in the buffer. -func (b *HealthEventBuffer) Length() int { - return len(b.events) -} - -// Get returns the element at the specified index without removing it. -func (b *HealthEventBuffer) Get(index int) (*HealthEventInfo, error) { - if index < 0 || index >= len(b.events) { - return nil, fmt.Errorf("index out of bounds or buffer is empty") // Index out of bounds or buffer is empty - } - - return &b.events[index], nil -} diff --git a/fault-quarantine-module/pkg/config/config.go b/fault-quarantine-module/pkg/config/config.go index 16273f4bb..80056c282 100644 --- a/fault-quarantine-module/pkg/config/config.go +++ b/fault-quarantine-module/pkg/config/config.go @@ -14,10 +14,6 @@ package config -import ( - "github.com/BurntSushi/toml" -) - type Rule struct { Kind string `toml:"kind"` Expression string `toml:"expression"` @@ -51,12 +47,3 @@ type TomlConfig struct { LabelPrefix string `toml:"label-prefix"` RuleSets []RuleSet `toml:"rule-sets"` } - -func LoadTomlConfig(path string) (*TomlConfig, error) { - var config TomlConfig - if _, err := toml.DecodeFile(path, &config); err != nil { - return nil, err - } - - return &config, nil -} diff --git a/fault-quarantine-module/pkg/evaluator/rule_evaluator.go b/fault-quarantine-module/pkg/evaluator/rule_evaluator.go index ee26cb48d..cc95da586 100644 --- a/fault-quarantine-module/pkg/evaluator/rule_evaluator.go +++ b/fault-quarantine-module/pkg/evaluator/rule_evaluator.go @@ -89,19 +89,19 @@ func (he *HealthEventRuleEvaluator) Evaluate( event *protos.HealthEvent) (common.RuleEvaluationResult, error) { obj, err := RoundTrip(event) if err != nil { - return common.RuleEvaluationErroredOut, fmt.Errorf("error roundtripping event: %w", err) + return common.RuleEvaluationFailed, fmt.Errorf("error roundtripping event: %w", err) } out, _, err := he.program.Eval(map[string]interface{}{ eventObjKey: obj, }) if err != nil { - return common.RuleEvaluationErroredOut, fmt.Errorf("failed to evaluate expression: %w", err) + return common.RuleEvaluationFailed, fmt.Errorf("failed to evaluate 
expression: %w", err) } result, ok := out.Value().(bool) if !ok { - return common.RuleEvaluationErroredOut, fmt.Errorf("expression did not return a boolean: %v", out) + return common.RuleEvaluationFailed, fmt.Errorf("expression did not return a boolean: %v", out) } if result { @@ -152,21 +152,19 @@ func NewNodeRuleEvaluator(expression string, nodeLister corelisters.NodeLister) func (nm *NodeRuleEvaluator) Evaluate(event *protos.HealthEvent) (common.RuleEvaluationResult, error) { slog.Info("Evaluating NodeRuleEvaluator for node", "node", event.NodeName) - // Get node metadata nodeInfo, err := nm.getNode(event.NodeName) if err != nil { - return common.RuleEvaluationErroredOut, fmt.Errorf("failed to get node metadata: %w", err) + return common.RuleEvaluationFailed, fmt.Errorf("failed to get node metadata: %w", err) } - // Evaluate the expression out, _, err := nm.program.Eval(nodeInfo) if err != nil { - return common.RuleEvaluationErroredOut, fmt.Errorf("failed to evaluate expression: %w", err) + return common.RuleEvaluationFailed, fmt.Errorf("failed to evaluate expression: %w", err) } result, ok := out.Value().(bool) if !ok { - return common.RuleEvaluationErroredOut, fmt.Errorf("expression did not return a boolean: %v", out) + return common.RuleEvaluationFailed, fmt.Errorf("expression did not return a boolean: %v", out) } if result { @@ -193,77 +191,57 @@ func (nm *NodeRuleEvaluator) getNode(nodeName string) (map[string]interface{}, e }, nil } +var primitiveKinds = map[reflect.Kind]bool{ + reflect.Bool: true, + reflect.Int: true, + reflect.Int8: true, + reflect.Int16: true, + reflect.Int32: true, + reflect.Int64: true, + reflect.Uint: true, + reflect.Uint8: true, + reflect.Uint16: true, + reflect.Uint32: true, + reflect.Uint64: true, + reflect.Uintptr: true, + reflect.Float32: true, + reflect.Float64: true, + reflect.Complex64: true, + reflect.Complex128: true, + reflect.String: true, +} + // recursively converts any Go value into a JSON-compatible structure // with all fields present. Structs become map[string]interface{}, slices become []interface{}, // maps become map[string]interface{}. 
Zero-values or nil pointers appear as null in the final map -// nolint: cyclop, gocognit //fix this as part of NGCC-21793 func structToInterface(v reflect.Value) interface{} { if !v.IsValid() { return nil } - switch v.Kind() { - case reflect.Ptr: - if v.IsNil() { - return nil - } + kind := v.Kind() - return structToInterface(v.Elem()) + if primitiveKinds[kind] { + return v.Interface() + } - case reflect.Struct: - result := make(map[string]interface{}) - typ := v.Type() - - for i := 0; i < typ.NumField(); i++ { - field := typ.Field(i) - // unexported - if field.PkgPath != "" { - continue - } - - jsonTag := field.Tag.Get("json") - if jsonTag == "" { - continue - } - - name := jsonTag - if idx := strings.Index(name, ","); idx != -1 { - name = name[:idx] - } - - if name == "" { - name = field.Name - } - - fieldVal := structToInterface(v.Field(i)) - result[name] = fieldVal - } + return handleComplexType(v, kind) +} - return result +func handleComplexType(v reflect.Value, kind reflect.Kind) interface{} { + switch kind { + case reflect.Ptr: + return handlePointer(v) + case reflect.Struct: + return handleStruct(v) case reflect.Slice, reflect.Array: - if v.IsNil() { - return nil - } - - sliceResult := make([]interface{}, v.Len()) - - for i := 0; i < v.Len(); i++ { - sliceResult[i] = structToInterface(v.Index(i)) - } - - return sliceResult + return handleSliceOrArray(v) case reflect.Map: - if v.IsNil() { - return nil - } - - mapResult := make(map[string]interface{}) - - for _, key := range v.MapKeys() { - mapResult[key.String()] = structToInterface(v.MapIndex(key)) - } - - return mapResult + return handleMap(v) + case reflect.Interface: + return handleInterface(v) + case reflect.Invalid, reflect.Chan, reflect.Func, reflect.UnsafePointer: + return nil case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, @@ -271,19 +249,90 @@ func structToInterface(v reflect.Value) interface{} { reflect.Complex64, reflect.Complex128, reflect.String: return v.Interface() - case reflect.Invalid: - return nil - case reflect.Chan, reflect.Func, reflect.UnsafePointer: + default: + return v.Interface() + } +} + +func handlePointer(v reflect.Value) interface{} { + if v.IsNil() { return nil - case reflect.Interface: - if v.IsNil() { - return nil + } + + return structToInterface(v.Elem()) +} + +func handleStruct(v reflect.Value) interface{} { + result := make(map[string]interface{}) + typ := v.Type() + + for i := 0; i < typ.NumField(); i++ { + field := typ.Field(i) + if field.PkgPath != "" { + continue } - return structToInterface(v.Elem()) - default: - return v.Interface() + jsonTag := field.Tag.Get("json") + if jsonTag == "" { + continue + } + + name := extractJSONFieldName(jsonTag, field.Name) + + fieldVal := structToInterface(v.Field(i)) + result[name] = fieldVal + } + + return result +} + +func extractJSONFieldName(jsonTag, fieldName string) string { + name := jsonTag + if idx := strings.Index(name, ","); idx != -1 { + name = name[:idx] + } + + if name == "" { + name = fieldName + } + + return name +} + +func handleSliceOrArray(v reflect.Value) interface{} { + if v.Kind() == reflect.Slice && v.IsNil() { + return nil } + + sliceResult := make([]interface{}, v.Len()) + + for i := 0; i < v.Len(); i++ { + sliceResult[i] = structToInterface(v.Index(i)) + } + + return sliceResult +} + +func handleMap(v reflect.Value) interface{} { + if v.IsNil() { + return nil + } + + mapResult := 
make(map[string]interface{}) + + for _, key := range v.MapKeys() { + mapResult[key.String()] = structToInterface(v.MapIndex(key)) + } + + return mapResult +} + +func handleInterface(v reflect.Value) interface{} { + if v.IsNil() { + return nil + } + + return structToInterface(v.Elem()) } // uses structToInterface for recursive processing diff --git a/fault-quarantine-module/pkg/evaluator/rule_evaluator_test.go b/fault-quarantine-module/pkg/evaluator/rule_evaluator_test.go index 6899f6d98..2dfb45577 100644 --- a/fault-quarantine-module/pkg/evaluator/rule_evaluator_test.go +++ b/fault-quarantine-module/pkg/evaluator/rule_evaluator_test.go @@ -15,22 +15,80 @@ package evaluator import ( + "context" + "log" + "os" "reflect" "testing" "time" + "github.com/nvidia/nvsentinel/data-models/pkg/protos" + "go.mongodb.org/mongo-driver/bson/primitive" "google.golang.org/protobuf/types/known/timestamppb" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/envtest" "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/informer" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/nodeinfo" - "github.com/nvidia/nvsentinel/data-models/pkg/protos" ) +var ( + testClient *kubernetes.Clientset + testEnv *envtest.Environment +) + +func TestMain(m *testing.M) { + var err error + + testEnv = &envtest.Environment{} + + testRestConfig, err := testEnv.Start() + if err != nil { + log.Fatalf("Failed to start test environment: %v", err) + } + + testClient, err = kubernetes.NewForConfig(testRestConfig) + if err != nil { + log.Fatalf("Failed to create kubernetes client: %v", err) + } + + exitCode := m.Run() + + if err := testEnv.Stop(); err != nil { + log.Fatalf("Failed to stop test environment: %v", err) + } + os.Exit(exitCode) +} + +func createTestNode(ctx context.Context, t *testing.T, name string, labels map[string]string) { + t.Helper() + + if labels == nil { + labels = make(map[string]string) + } + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: labels, + }, + Spec: corev1.NodeSpec{}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionTrue}, + }, + }, + } + + _, err := testClient.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create test node %s: %v", name, err) + } +} + func TestEvaluate(t *testing.T) { expression := "event.agent == 'GPU' && event.checkName == 'XidError' && ('31' in event.errorCode || '42' in event.errorCode)" evaluator, err := NewHealthEventRuleEvaluator(expression) @@ -113,49 +171,35 @@ func TestNodeToSkipLabelRuleEvaluator(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + nodeName := "test-node-" + primitive.NewObjectID().Hex()[:8] - // Create mock node object with labels from test case - node := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-node", - Labels: tt.nodeLabels, // Keep original labels from test case - }, - Spec: corev1.NodeSpec{}, - } + createTestNode(ctx, t, nodeName, tt.nodeLabels) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() - // Ensure the required label for the informer exists. - // The NodeInformer specifically looks for GpuNodeLabel ("nvidia.com/gpu.present"). 
- if node.Labels == nil { - node.Labels = make(map[string]string) - } - // Add the label the informer expects, preserving existing labels - node.Labels[informer.GpuNodeLabel] = "true" - - clientset := fake.NewSimpleClientset(node) - workSignal := make(chan struct{}, 1) - // Use 0 resync period for tests unless specific timing is needed - nodeInfo := nodeinfo.NewNodeInfo(workSignal) - nodeInformer, err := informer.NewNodeInformer(clientset, 0, workSignal, nodeInfo) + nodeInformer, err := informer.NewNodeInformer(testClient, 0) if err != nil { t.Fatalf("Failed to create NodeInformer: %v", err) } + stopCh := make(chan struct{}) defer close(stopCh) go nodeInformer.Run(stopCh) - // Wait for the cache to sync - if ok := cache.WaitForCacheSync(stopCh, nodeInformer.HasSynced); !ok { - t.Fatalf("failed to wait for caches to sync") + if ok := cache.WaitForCacheSync(stopCh, nodeInformer.GetInformer().HasSynced); !ok { + t.Fatalf("NodeInformer failed to sync") } - // Create evaluator with mocked client + evaluator, err := NewNodeRuleEvaluator(tt.expression, nodeInformer.Lister()) if err != nil && !tt.expectError { t.Fatalf("Failed to create NodeToSkipLabelRuleEvaluator: %v", err) } if evaluator != nil { isEvaluated, err := evaluator.Evaluate(&protos.HealthEvent{ - NodeName: "test-node", + NodeName: nodeName, }) if (err != nil) != tt.expectError { t.Errorf("Failed to evaluate expression: %s: %+v", tt.name, err) diff --git a/fault-quarantine-module/pkg/evaluator/rule_set_evaluator.go b/fault-quarantine-module/pkg/evaluator/rule_set_evaluator.go index 84e739434..9df4320dd 100644 --- a/fault-quarantine-module/pkg/evaluator/rule_set_evaluator.go +++ b/fault-quarantine-module/pkg/evaluator/rule_set_evaluator.go @@ -18,15 +18,15 @@ import ( "fmt" "log/slog" + multierror "github.com/hashicorp/go-multierror" "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/informer" - - multierror "github.com/hashicorp/go-multierror" - "k8s.io/client-go/kubernetes" ) -func InitializeRuleSetEvaluators(ruleSets []config.RuleSet, - client kubernetes.Interface, nodeInformer *informer.NodeInformer) ([]RuleSetEvaluatorIface, error) { +func InitializeRuleSetEvaluators( + ruleSets []config.RuleSet, + nodeInformer *informer.NodeInformer, +) ([]RuleSetEvaluatorIface, error) { var ( ruleSetEvals []RuleSetEvaluatorIface errs *multierror.Error @@ -35,7 +35,7 @@ func InitializeRuleSetEvaluators(ruleSets []config.RuleSet, for _, ruleSet := range ruleSets { // We can extend this to add different types of match based rules if len(ruleSet.Match.Any) > 0 { - evaluators, err := createEvaluators(ruleSet.Match.Any, client, nodeInformer) + evaluators, err := createEvaluators(ruleSet.Match.Any, nodeInformer) if err != nil { errs = multierror.Append(errs, err) } else { @@ -47,7 +47,7 @@ func InitializeRuleSetEvaluators(ruleSets []config.RuleSet, } if len(ruleSet.Match.All) > 0 { - evaluators, err := createEvaluators(ruleSet.Match.All, client, nodeInformer) + evaluators, err := createEvaluators(ruleSet.Match.All, nodeInformer) if err != nil { errs = multierror.Append(errs, err) } else { @@ -62,8 +62,7 @@ func InitializeRuleSetEvaluators(ruleSets []config.RuleSet, return ruleSetEvals, errs.ErrorOrNil() } -func createEvaluators(rules []config.Rule, client kubernetes.Interface, - nodeInformer *informer.NodeInformer) ([]RuleEvaluator, error) { +func createEvaluators(rules []config.Rule, nodeInformer *informer.NodeInformer) ([]RuleEvaluator, error) { evaluators := []RuleEvaluator{} 
var errs *multierror.Error diff --git a/fault-quarantine-module/pkg/evaluator/rule_set_evaluator_any.go b/fault-quarantine-module/pkg/evaluator/rule_set_evaluator_any.go index 78dbe6f2e..df0ccd07b 100644 --- a/fault-quarantine-module/pkg/evaluator/rule_set_evaluator_any.go +++ b/fault-quarantine-module/pkg/evaluator/rule_set_evaluator_any.go @@ -44,7 +44,7 @@ func (anyEval *AnyRuleSetEvaluator) Evaluate( } if errs.ErrorOrNil() != nil { - return common.RuleEvaluationErroredOut, errs + return common.RuleEvaluationFailed, errs } return common.RuleEvaluationFailed, nil diff --git a/fault-quarantine-module/pkg/evaluator/rule_set_evaluator_test.go b/fault-quarantine-module/pkg/evaluator/rule_set_evaluator_test.go index fb1925ceb..8971bec0e 100644 --- a/fault-quarantine-module/pkg/evaluator/rule_set_evaluator_test.go +++ b/fault-quarantine-module/pkg/evaluator/rule_set_evaluator_test.go @@ -20,10 +20,9 @@ import ( "testing" multierror "github.com/hashicorp/go-multierror" + "github.com/nvidia/nvsentinel/data-models/pkg/protos" "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" - "github.com/nvidia/nvsentinel/data-models/pkg/protos" - "k8s.io/client-go/kubernetes/fake" ) type MockRuleEvaluator struct { @@ -73,7 +72,7 @@ func TestAnyRuleSetEvaluator_Evaluate(t *testing.T) { &MockRuleEvaluator{result: false, err: errors.New("evaluation error")}, }, event: &protos.HealthEvent{}, - expected: common.RuleEvaluationErroredOut, + expected: common.RuleEvaluationFailed, expectErr: true, }, { @@ -93,7 +92,7 @@ func TestAnyRuleSetEvaluator_Evaluate(t *testing.T) { &MockRuleEvaluator{result: false, err: errors.New("error 2")}, }, event: &protos.HealthEvent{}, - expected: common.RuleEvaluationErroredOut, + expected: common.RuleEvaluationFailed, expectErr: true, }, } @@ -289,11 +288,9 @@ func TestInitializeRuleSetEvaluators(t *testing.T) { }, } - clientset := fake.NewSimpleClientset() - for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - evaluators, err := InitializeRuleSetEvaluators(tt.ruleSets, clientset, nil) + evaluators, err := InitializeRuleSetEvaluators(tt.ruleSets, nil) if len(evaluators) != tt.expectedCount { t.Errorf("Expected %d evaluators, got %d", tt.expectedCount, len(evaluators)) } @@ -365,10 +362,9 @@ func TestCreateEvaluators(t *testing.T) { }, } - clientset := fake.NewSimpleClientset() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - evaluators, err := createEvaluators(tt.rules, clientset, nil) + evaluators, err := createEvaluators(tt.rules, nil) if len(evaluators) != tt.expectedCount { t.Errorf("Expected %d evaluators, got %d", tt.expectedCount, len(evaluators)) } diff --git a/fault-quarantine-module/pkg/healthEventsAnnotation/health_events_annotation_map.go b/fault-quarantine-module/pkg/healthEventsAnnotation/health_events_annotation_map.go index 53da00f80..3ceea8f4c 100644 --- a/fault-quarantine-module/pkg/healthEventsAnnotation/health_events_annotation_map.go +++ b/fault-quarantine-module/pkg/healthEventsAnnotation/health_events_annotation_map.go @@ -74,7 +74,6 @@ func createEventKeyForEntity( // createEventKeys creates keys for all entities in a HealthEvent func createEventKeys(event *protos.HealthEvent) []HealthEventKey { if len(event.EntitiesImpacted) == 0 { - // If no entities, create a single key without entity info return []HealthEventKey{createEventKeyForEntity(event, nil)} } @@ -189,7 +188,6 @@ func (he *HealthEventsAnnotationMap) RemoveEvent(event *protos.HealthEvent) int } 
} - // Then remove them for _, key := range keys { delete(he.Events, key) } @@ -215,7 +213,6 @@ func (he *HealthEventsAnnotationMap) removeAllEntitiesForCheck(event *protos.Hea } } - // Remove all matching keys for _, key := range keysToRemove { delete(he.Events, key) } @@ -225,7 +222,6 @@ func (he *HealthEventsAnnotationMap) removeAllEntitiesForCheck(event *protos.Hea // RemoveEntitiesForCheck removes specific entities for a check func (he *HealthEventsAnnotationMap) RemoveEntitiesForCheck(event *protos.HealthEvent) { - // Remove each entity specified in the event keys := createEventKeys(event) for _, key := range keys { delete(he.Events, key) @@ -277,12 +273,10 @@ func (he *HealthEventsAnnotationMap) UnmarshalJSON(data []byte) error { return fmt.Errorf("failed to unmarshal health events: %w", err) } - // Initialize the map if needed if he.Events == nil { he.Events = make(map[HealthEventKey]*protos.HealthEvent) } - // Clear existing events and add the unmarshaled ones for k := range he.Events { delete(he.Events, k) } diff --git a/fault-quarantine-module/pkg/informer/k8s_client.go b/fault-quarantine-module/pkg/informer/k8s_client.go new file mode 100644 index 000000000..c41aa5456 --- /dev/null +++ b/fault-quarantine-module/pkg/informer/k8s_client.go @@ -0,0 +1,462 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package informer + +import ( + "context" + "fmt" + "log/slog" + "sync" + "time" + + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/breaker" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/util/retry" +) + +var customBackoff = wait.Backoff{ + Steps: 10, + Duration: 10 * time.Millisecond, + Factor: 1.5, + Jitter: 0.1, +} + +type FaultQuarantineClient struct { + Clientset kubernetes.Interface + DryRunMode bool + NodeInformer *NodeInformer + cordonedReasonLabelKey string + uncordonedReasonLabelKey string + operationMutex sync.Map // map[string]*sync.Mutex for per-node locking +} + +func NewFaultQuarantineClient(kubeconfig string, dryRun bool, + resyncPeriod time.Duration) (*FaultQuarantineClient, error) { + config, err := clientcmd.BuildConfigFromFlags("", kubeconfig) + if err != nil { + return nil, fmt.Errorf("error creating Kubernetes config: %w", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("error creating clientset: %w", err) + } + + nodeInformer, err := NewNodeInformer(clientset, resyncPeriod) + if err != nil { + return nil, fmt.Errorf("error creating node informer: %w", err) + } + + client := &FaultQuarantineClient{ + Clientset: clientset, + DryRunMode: dryRun, + NodeInformer: nodeInformer, + } + + return client, nil +} + +func (c *FaultQuarantineClient) EnsureCircuitBreakerConfigMap(ctx context.Context, + name, namespace string, initialStatus breaker.State) error { + slog.Info("Ensuring circuit breaker config map", + "name", name, "namespace", namespace, "initialStatus", initialStatus) + + cmClient := c.Clientset.CoreV1().ConfigMaps(namespace) + + _, err := cmClient.Get(ctx, name, metav1.GetOptions{}) + if err == nil { + slog.Info("Circuit breaker config map already exists", "name", name, "namespace", namespace) + return nil + } + + if !errors.IsNotFound(err) { + slog.Error("Error getting circuit breaker config map", "name", name, "namespace", namespace, "error", err) + return fmt.Errorf("failed to get config map %s in namespace %s: %w", name, namespace, err) + } + + cm := &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, + Data: map[string]string{"status": string(initialStatus)}, + } + + _, err = cmClient.Create(ctx, cm, metav1.CreateOptions{}) + if err != nil { + slog.Error("Error creating circuit breaker config map", "name", name, "namespace", namespace, "error", err) + return fmt.Errorf("failed to create config map %s in namespace %s: %w", name, namespace, err) + } + + return nil +} + +func (c *FaultQuarantineClient) GetTotalNodes(ctx context.Context) (int, error) { + totalNodes, _, err := c.NodeInformer.GetNodeCounts() + if err != nil { + return 0, fmt.Errorf("failed to get node counts from informer: %w", err) + } + + slog.Debug("Got total nodes from NodeInformer cache", "totalNodes", totalNodes) + + return totalNodes, nil +} + +func (c *FaultQuarantineClient) SetLabelKeys(cordonedReasonKey, uncordonedReasonKey string) { + c.cordonedReasonLabelKey = cordonedReasonKey + c.uncordonedReasonLabelKey = uncordonedReasonKey +} + +func (c *FaultQuarantineClient) UpdateNode(ctx context.Context, nodeName string, updateFn func(*v1.Node) error) error { + mu, _ := 
c.operationMutex.LoadOrStore(nodeName, &sync.Mutex{}) + mu.(*sync.Mutex).Lock() + defer mu.(*sync.Mutex).Unlock() + + return retry.OnError(retry.DefaultBackoff, errors.IsConflict, func() error { + node, err := c.Clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return err + } + + if err := updateFn(node); err != nil { + return err + } + + _, err = c.Clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) + if err != nil { + return err + } + + slog.Debug("Updated node", "node", nodeName) + + return nil + }) +} + +func (c *FaultQuarantineClient) ReadCircuitBreakerState( + ctx context.Context, name, namespace string, +) (breaker.State, error) { + slog.Info("Reading circuit breaker state from config map", + "name", name, "namespace", namespace) + + cm, err := c.Clientset.CoreV1().ConfigMaps(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return "", fmt.Errorf("failed to get config map %s in namespace %s: %w", name, namespace, err) + } + + if cm.Data == nil { + return "", nil + } + + return breaker.State(cm.Data["status"]), nil +} + +func (c *FaultQuarantineClient) WriteCircuitBreakerState( + ctx context.Context, name, namespace string, state breaker.State, +) error { + cmClient := c.Clientset.CoreV1().ConfigMaps(namespace) + + return retry.OnError(customBackoff, errors.IsConflict, func() error { + cm, err := cmClient.Get(ctx, name, metav1.GetOptions{}) + if err != nil { + slog.Error("Error getting circuit breaker config map", "name", name, "namespace", namespace, "error", err) + return err + } + + if cm.Data == nil { + cm.Data = map[string]string{} + } + + cm.Data["status"] = string(state) + + _, err = cmClient.Update(ctx, cm, metav1.UpdateOptions{}) + if err != nil { + slog.Error("Error updating circuit breaker config map", "name", name, "namespace", namespace, "error", err) + } + + return err + }) +} + +func (c *FaultQuarantineClient) QuarantineNodeAndSetAnnotations( + ctx context.Context, + nodename string, + taints []config.Taint, + isCordon bool, + annotations map[string]string, + labels map[string]string, +) error { + updateFn := func(node *v1.Node) error { + if len(taints) > 0 { + if err := c.applyTaints(node, taints, nodename); err != nil { + return fmt.Errorf("failed to apply taints to node %s: %w", nodename, err) + } + } + + if isCordon { + if shouldSkip := c.handleCordon(node, nodename); shouldSkip { + return nil + } + } + + if len(annotations) > 0 { + c.applyAnnotations(node, annotations, nodename) + } + + if len(labels) > 0 { + c.applyLabels(node, labels, nodename) + } + + return nil + } + + return c.UpdateNode(ctx, nodename, updateFn) +} + +func (c *FaultQuarantineClient) applyTaints(node *v1.Node, taints []config.Taint, nodename string) error { + existingTaints := make(map[config.Taint]v1.Taint) + for _, taint := range node.Spec.Taints { + existingTaints[config.Taint{Key: taint.Key, Value: taint.Value, Effect: string(taint.Effect)}] = taint + } + + for _, taintConfig := range taints { + key := config.Taint{Key: taintConfig.Key, Value: taintConfig.Value, Effect: string(taintConfig.Effect)} + + if _, exists := existingTaints[key]; !exists { + slog.Info("Tainting node", "node", nodename, "taintConfig", taintConfig) + existingTaints[key] = v1.Taint{ + Key: taintConfig.Key, + Value: taintConfig.Value, + Effect: v1.TaintEffect(taintConfig.Effect), + } + } + } + + node.Spec.Taints = []v1.Taint{} + for _, taint := range existingTaints { + node.Spec.Taints = append(node.Spec.Taints, taint) + } + + return nil +} + +func 
(c *FaultQuarantineClient) handleCordon(node *v1.Node, nodename string) bool { + _, exist := node.Annotations[common.QuarantineHealthEventAnnotationKey] + if node.Spec.Unschedulable { + if exist { + slog.Info("Node already cordoned by FQM; skipping taint/annotation updates", "node", nodename) + return true + } + + slog.Info("Node is cordoned manually; applying FQM taints/annotations", "node", nodename) + } else { + slog.Info("Cordoning node", "node", nodename) + + if !c.DryRunMode { + node.Spec.Unschedulable = true + } + } + + return false +} + +func (c *FaultQuarantineClient) applyAnnotations(node *v1.Node, annotations map[string]string, nodename string) { + if node.Annotations == nil { + node.Annotations = make(map[string]string) + } + + slog.Info("Setting annotations on node", "node", nodename, "annotations", annotations) + + for annotationKey, annotationValue := range annotations { + node.Annotations[annotationKey] = annotationValue + } +} + +func (c *FaultQuarantineClient) applyLabels(node *v1.Node, labels map[string]string, nodename string) { + if node.Labels == nil { + node.Labels = make(map[string]string) + } + + slog.Info("Adding labels on node", "node", nodename) + + for k, v := range labels { + node.Labels[k] = v + } +} + +func (c *FaultQuarantineClient) UnQuarantineNodeAndRemoveAnnotations( + ctx context.Context, + nodename string, + taints []config.Taint, + annotationKeys []string, + labelsToRemove []string, + labels map[string]string, +) error { + updateFn := func(node *v1.Node) error { + if len(taints) > 0 { + if shouldReturn := c.removeTaints(node, taints, nodename); shouldReturn { + return nil + } + } + + c.handleUncordon(node, labels, nodename) + + if len(annotationKeys) > 0 { + for _, annotationKey := range annotationKeys { + slog.Info("Removing annotation key from node", "key", annotationKey, "node", nodename) + delete(node.Annotations, annotationKey) + } + } + + if len(labelsToRemove) > 0 { + for _, labelKey := range labelsToRemove { + slog.Info("Removing label key from node", "key", labelKey, "node", nodename) + delete(node.Labels, labelKey) + } + } + + return nil + } + + return c.UpdateNode(ctx, nodename, updateFn) +} + +func (c *FaultQuarantineClient) removeTaints(node *v1.Node, taints []config.Taint, nodename string) bool { + taintsAlreadyPresentOnNodeMap := map[config.Taint]bool{} + for _, taint := range node.Spec.Taints { + taintsAlreadyPresentOnNodeMap[config.Taint{Key: taint.Key, Value: taint.Value, Effect: string(taint.Effect)}] = true + } + + taintsToActuallyRemove := []config.Taint{} + + for _, taintConfig := range taints { + key := config.Taint{ + Key: taintConfig.Key, + Value: taintConfig.Value, + Effect: taintConfig.Effect, + } + + found := taintsAlreadyPresentOnNodeMap[key] + if !found { + slog.Info("Node already does not have the taint", "node", nodename, "taint", taintConfig) + } else { + taintsToActuallyRemove = append(taintsToActuallyRemove, taintConfig) + } + } + + if len(taintsToActuallyRemove) == 0 { + return true + } + + slog.Info("Untainting node", "node", nodename, "taints", taintsToActuallyRemove) + + c.removeNodeTaints(node, taintsToActuallyRemove) + + return false +} + +func (c *FaultQuarantineClient) handleUncordon( + node *v1.Node, labels map[string]string, nodename string, +) { + slog.Info("Uncordoning node", "node", nodename) + + if !c.DryRunMode { + node.Spec.Unschedulable = false + } + + if len(labels) > 0 { + c.applyLabels(node, labels, nodename) + + uncordonReason := node.Labels[c.cordonedReasonLabelKey] + + if uncordonReason != "" { 
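+			// Kubernetes label values are capped at 63 characters; truncate to 55 so the
+			// "-removed" suffix appended below still fits within the limit.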
+ if len(uncordonReason) > 55 { + uncordonReason = uncordonReason[:55] + } + + node.Labels[c.uncordonedReasonLabelKey] = uncordonReason + "-removed" + } + } +} + +// HandleManualUncordonCleanup atomically removes FQ annotations/taints/labels and adds manual uncordon annotation +// This is used when a node is manually uncordoned while having FQ quarantine state +func (c *FaultQuarantineClient) HandleManualUncordonCleanup( + ctx context.Context, + nodename string, + taintsToRemove []config.Taint, + annotationsToRemove []string, + annotationsToAdd map[string]string, + labelsToRemove []string, +) error { + updateFn := func(node *v1.Node) error { + if len(taintsToRemove) > 0 { + c.removeNodeTaints(node, taintsToRemove) + } + + if len(annotationsToRemove) > 0 || len(annotationsToAdd) > 0 { + c.updateNodeAnnotationsForManualUncordon(node, annotationsToRemove, annotationsToAdd) + } + + if len(labelsToRemove) > 0 { + for _, key := range labelsToRemove { + delete(node.Labels, key) + } + } + + return nil + } + + return c.UpdateNode(ctx, nodename, updateFn) +} + +func (c *FaultQuarantineClient) removeNodeTaints(node *v1.Node, taintsToRemove []config.Taint) { + taintsToRemoveMap := make(map[config.Taint]bool, len(taintsToRemove)) + for _, taint := range taintsToRemove { + taintsToRemoveMap[taint] = true + } + + newTaints := make([]v1.Taint, 0, len(node.Spec.Taints)) + + for _, taint := range node.Spec.Taints { + if !taintsToRemoveMap[config.Taint{Key: taint.Key, Value: taint.Value, Effect: string(taint.Effect)}] { + newTaints = append(newTaints, taint) + } + } + + node.Spec.Taints = newTaints +} + +func (c *FaultQuarantineClient) updateNodeAnnotationsForManualUncordon( + node *v1.Node, + annotationsToRemove []string, + annotationsToAdd map[string]string, +) { + if node.Annotations == nil { + node.Annotations = make(map[string]string) + } + + for _, key := range annotationsToRemove { + delete(node.Annotations, key) + } + + for key, value := range annotationsToAdd { + node.Annotations[key] = value + } +} diff --git a/fault-quarantine-module/pkg/reconciler/k8s_client_iface.go b/fault-quarantine-module/pkg/informer/k8s_client_interface.go similarity index 56% rename from fault-quarantine-module/pkg/reconciler/k8s_client_iface.go rename to fault-quarantine-module/pkg/informer/k8s_client_interface.go index 591354a26..68b337478 100644 --- a/fault-quarantine-module/pkg/reconciler/k8s_client_iface.go +++ b/fault-quarantine-module/pkg/informer/k8s_client_interface.go @@ -12,28 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-package reconciler +package informer import ( "context" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/breaker" "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" - "k8s.io/client-go/kubernetes" + v1 "k8s.io/api/core/v1" ) // K8sClientInterface defines the methods used by Reconciler from k8sClient type K8sClientInterface interface { - GetNodeAnnotations(ctx context.Context, nodeName string) (map[string]string, error) - GetNodesWithAnnotation(ctx context.Context, annotationKey string) ([]string, error) - TaintAndCordonNodeAndSetAnnotations(ctx context.Context, nodeName string, + QuarantineNodeAndSetAnnotations(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error - UnTaintAndUnCordonNodeAndRemoveAnnotations(ctx context.Context, nodeName string, - taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, + UnQuarantineNodeAndRemoveAnnotations(ctx context.Context, nodeName string, + taints []config.Taint, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error - UpdateNodeAnnotations(ctx context.Context, nodeName string, annotations map[string]string) error - GetK8sClient() kubernetes.Interface - EnsureCircuitBreakerConfigMap(ctx context.Context, name, namespace string, initialStatus string) error - ReadCircuitBreakerState(ctx context.Context, name, namespace string) (string, error) - WriteCircuitBreakerState(ctx context.Context, name, namespace, status string) error - GetTotalGpuNodes(ctx context.Context) (int, error) + HandleManualUncordonCleanup(ctx context.Context, nodeName string, taintsToRemove []config.Taint, + annotationsToRemove []string, annotationsToAdd map[string]string, labelsToRemove []string) error + UpdateNode(ctx context.Context, nodeName string, updateFn func(*v1.Node) error) error + EnsureCircuitBreakerConfigMap(ctx context.Context, name, namespace string, initialStatus breaker.State) error + ReadCircuitBreakerState(ctx context.Context, name, namespace string) (breaker.State, error) + WriteCircuitBreakerState(ctx context.Context, name, namespace string, state breaker.State) error + GetTotalNodes(ctx context.Context) (int, error) } diff --git a/fault-quarantine-module/pkg/informer/k8s_client_test.go b/fault-quarantine-module/pkg/informer/k8s_client_test.go new file mode 100644 index 000000000..8b05826c3 --- /dev/null +++ b/fault-quarantine-module/pkg/informer/k8s_client_test.go @@ -0,0 +1,704 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package informer + +import ( + "context" + "log" + "os" + "testing" + "time" + + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" + "go.mongodb.org/mongo-driver/bson/primitive" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/controller-runtime/pkg/envtest" +) + +const ( + cordonedByLabelKey = "test-cordon-by" + cordonedReasonLabelKey = "test-cordon-reason" + cordonedTimestampLabelKey = "test-cordon-timestamp" + uncordonedByLabelKey = "test-uncordon-by" + uncordonedReasonLabelKey = "test-uncordon-reason" + uncordonedTimestampLabelKey = "test-uncordon-timestamp" +) + +var ( + testClient *kubernetes.Clientset + testEnv *envtest.Environment +) + +func TestMain(m *testing.M) { + var err error + + testEnv = &envtest.Environment{} + + testRestConfig, err := testEnv.Start() + if err != nil { + log.Fatalf("Failed to start test environment: %v", err) + } + + testClient, err = kubernetes.NewForConfig(testRestConfig) + if err != nil { + log.Fatalf("Failed to create kubernetes client: %v", err) + } + + exitCode := m.Run() + + if err := testEnv.Stop(); err != nil { + log.Fatalf("Failed to stop test environment: %v", err) + } + os.Exit(exitCode) +} + +func setupTestClient(t *testing.T) *FaultQuarantineClient { + t.Helper() + + client := &FaultQuarantineClient{ + Clientset: testClient, + DryRunMode: false, + } + + nodeInformer, err := NewNodeInformer(testClient, 0) + if err != nil { + t.Fatalf("Failed to create NodeInformer: %v", err) + } + + stopCh := make(chan struct{}) + t.Cleanup(func() { close(stopCh) }) + + go nodeInformer.Run(stopCh) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err = wait.PollUntilContextTimeout(ctx, 50*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) { + return nodeInformer.HasSynced(), nil + }) + if err != nil { + t.Fatalf("NodeInformer failed to sync: %v", err) + } + + client.NodeInformer = nodeInformer + client.SetLabelKeys(cordonedReasonLabelKey, uncordonedReasonLabelKey) + + return client +} + +func createTestNode(ctx context.Context, t *testing.T, name string, annotations map[string]string, labels map[string]string, taints []v1.Taint, unschedulable bool) { + t.Helper() + + if labels == nil { + labels = make(map[string]string) + } + + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Annotations: annotations, + Labels: labels, + }, + Spec: v1.NodeSpec{ + Unschedulable: unschedulable, + Taints: taints, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + {Type: v1.NodeReady, Status: v1.ConditionTrue}, + }, + }, + } + + _, err := testClient.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create test node %s: %v", name, err) + } +} + +func TestQuarantineNodeAndSetAnnotations(t *testing.T) { + ctx := context.Background() + nodeName := "test-taint-cordon-" + primitive.NewObjectID().Hex()[:8] + + createTestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + taints := []config.Taint{ + { + Key: "test-key", + Value: "test-value", + Effect: "NoSchedule", + }, + } + annotations := map[string]string{ + "test-annotation": "test-value", + } + + labelsMap := map[string]string{ + cordonedByLabelKey: 
common.ServiceName,
+		cordonedReasonLabelKey:    "gpu-error",
+		cordonedTimestampLabelKey: time.Now().UTC().Format("2006-01-02T15-04-05Z"),
+	}
+	err := k8sClient.QuarantineNodeAndSetAnnotations(ctx, nodeName, taints, true, annotations, labelsMap)
+	if err != nil {
+		t.Fatalf("QuarantineNodeAndSetAnnotations failed: %v", err)
+	}
+
+	updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("Failed to get updated node: %v", err)
+	}
+
+	// Check taints (filter out automatic taints added by envtest like node.kubernetes.io/not-ready)
+	var testTaints []v1.Taint
+	for _, taint := range updatedNode.Spec.Taints {
+		if taint.Key == "test-key" {
+			testTaints = append(testTaints, taint)
+		}
+	}
+	if len(testTaints) != 1 {
+		t.Errorf("Expected 1 test taint, got %d", len(testTaints))
+	}
+	if len(testTaints) > 0 && testTaints[0].Key != "test-key" {
+		t.Errorf("Unexpected taint key: %s", testTaints[0].Key)
+	}
+
+	// Check cordon
+	if !updatedNode.Spec.Unschedulable {
+		t.Errorf("Node should be cordoned")
+	}
+	// Check that the cordon labels are present
+	if updatedNode.Labels[cordonedByLabelKey] != common.ServiceName {
+		t.Errorf("Expected cordon-by label to be %s, got %s", common.ServiceName, updatedNode.Labels[cordonedByLabelKey])
+	}
+	if updatedNode.Labels[cordonedReasonLabelKey] != "gpu-error" {
+		t.Errorf("Expected cordon-reason label to be gpu-error, got %s", updatedNode.Labels[cordonedReasonLabelKey])
+	}
+	if updatedNode.Labels[cordonedTimestampLabelKey] == "" {
+		t.Errorf("Expected cordon-timestamp label to be set")
+	}
+
+	// Check annotations
+	if val, ok := updatedNode.Annotations["test-annotation"]; !ok || val != "test-value" {
+		t.Errorf("Annotation not set correctly")
+	}
+}
+
+func TestUnQuarantineNodeAndRemoveAnnotations(t *testing.T) {
+	ctx := context.Background()
+	nodeName := "test-untaint-uncordon-" + primitive.NewObjectID().Hex()[:8]
+
+	annotations := map[string]string{
+		"test-annotation": "test-value",
+	}
+	labels := map[string]string{
+		cordonedByLabelKey:        common.ServiceName,
+		cordonedReasonLabelKey:    "gpu-error",
+		cordonedTimestampLabelKey: time.Now().UTC().Format("2006-01-02T15-04-05Z"),
+	}
+	taints := []v1.Taint{
+		{
+			Key:    "test-key",
+			Value:  "test-value",
+			Effect: v1.TaintEffect("NoSchedule"),
+		},
+	}
+
+	createTestNode(ctx, t, nodeName, annotations, labels, taints, true)
+	defer func() {
+		_ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{})
+	}()
+
+	k8sClient := setupTestClient(t)
+
+	taintsToRemove := []config.Taint{
+		{
+			Key:    "test-key",
+			Value:  "test-value",
+			Effect: "NoSchedule",
+		},
+	}
+	annotationKeys := []string{"test-annotation"}
+
+	labelsMap := map[string]string{
+		uncordonedByLabelKey:        common.ServiceName,
+		uncordonedTimestampLabelKey: time.Now().UTC().Format("2006-01-02T15-04-05Z"),
+	}
+
+	err := k8sClient.UnQuarantineNodeAndRemoveAnnotations(ctx, nodeName, taintsToRemove, annotationKeys, []string{cordonedByLabelKey, cordonedReasonLabelKey, cordonedTimestampLabelKey}, labelsMap)
+	if err != nil {
+		t.Fatalf("UnQuarantineNodeAndRemoveAnnotations failed: %v", err)
+	}
+
+	updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("Failed to get updated node: %v", err)
+	}
+
+	// Check that our test taint was removed (filter out automatic taints from envtest)
+	var testTaints []v1.Taint
+	for _, taint := range updatedNode.Spec.Taints {
+		if taint.Key == "test-key" {
+			testTaints = append(testTaints, taint)
+		}
+	}
+	if len(testTaints) != 0 {
+		t.Errorf("Expected 0 test taints, got %d", len(testTaints))
+	}
+
+	if updatedNode.Spec.Unschedulable {
+		t.Errorf("Node should be uncordoned")
+	}
+
+	if _, ok := updatedNode.Annotations["test-annotation"]; ok {
+		t.Errorf("Annotation should be removed")
+	}
+
+	_, exists1 := updatedNode.Labels[cordonedByLabelKey]
+	_, exists2 := updatedNode.Labels[cordonedReasonLabelKey]
+	_, exists3 := updatedNode.Labels[cordonedTimestampLabelKey]
+
+	if exists1 || exists2 || exists3 {
+		t.Errorf("Expected cordoned labels to be removed from node")
+	}
+
+	// Check that the uncordon labels are present
+	if updatedNode.Labels[uncordonedByLabelKey] != common.ServiceName {
+		t.Errorf("Expected uncordon-by label to be %s, got %s", common.ServiceName, updatedNode.Labels[uncordonedByLabelKey])
+	}
+	if updatedNode.Labels[uncordonedReasonLabelKey] != "gpu-error-removed" {
+		t.Errorf("Expected uncordon-reason label to be gpu-error-removed, got %s", updatedNode.Labels[uncordonedReasonLabelKey])
+	}
+	if updatedNode.Labels[uncordonedTimestampLabelKey] == "" {
+		t.Errorf("Expected uncordon-timestamp label to be set")
+	}
+}
+
+func TestTaintAndCordonNode_NodeNotFound(t *testing.T) {
+	ctx := context.Background()
+	k8sClient := setupTestClient(t)
+
+	err := k8sClient.QuarantineNodeAndSetAnnotations(ctx, "non-existent-node", nil, false, nil, map[string]string{})
+	if err == nil {
+		t.Errorf("Expected error when node does not exist, got nil")
+	}
+}
+
+func TestTaintAndCordonNode_NoChanges(t *testing.T) {
+	ctx := context.Background()
+	nodeName := "test-no-change-" + primitive.NewObjectID().Hex()[:8]
+
+	createTestNode(ctx, t, nodeName, nil, nil, nil, false)
+	defer func() {
+		_ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{})
+	}()
+
+	k8sClient := setupTestClient(t)
+
+	err := k8sClient.QuarantineNodeAndSetAnnotations(ctx, nodeName, nil, false, nil, map[string]string{})
+	if err != nil {
+		t.Fatalf("Expected no error, got %v", err)
+	}
+
+	updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("Failed to get updated node: %v", err)
+	}
+
+	// envtest may add automatic taints of its own, so taints are not asserted here;
+	// just verify the function left the node schedulable and unannotated
+	if updatedNode.Spec.Unschedulable {
+		t.Errorf("Expected node to remain schedulable")
+	}
+	if len(updatedNode.Annotations) != 0 {
+		t.Errorf("Expected no annotations, got %v", updatedNode.Annotations)
+	}
+}
+
+func TestUnTaintAndUnCordonNode_NoChanges(t *testing.T) {
+	ctx := context.Background()
+	nodeName := "test-no-change-untaint-" + primitive.NewObjectID().Hex()[:8]
+
+	createTestNode(ctx, t, nodeName, nil, nil, nil, false)
+	defer func() {
+		_ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{})
+	}()
+
+	k8sClient := setupTestClient(t)
+
+	err := k8sClient.UnQuarantineNodeAndRemoveAnnotations(ctx, nodeName, nil, nil, []string{}, map[string]string{})
+	if err != nil {
+		t.Fatalf("Expected no error, got %v", err)
+	}
+
+	updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("Failed to get updated node: %v", err)
+	}
+
+	// Function always uncordons, so node should be schedulable
+	if updatedNode.Spec.Unschedulable {
+		t.Errorf("Expected node to be uncordoned")
+	}
+	if len(updatedNode.Annotations) != 0 {
+		t.Errorf("Expected no annotations, got %v", 
updatedNode.Annotations) + } +} + +func TestUnTaintAndUnCordonNode_PartialTaintRemoval(t *testing.T) { + ctx := context.Background() + nodeName := "test-partial-taint-" + primitive.NewObjectID().Hex()[:8] + + taints := []v1.Taint{ + {Key: "taint1", Value: "val1", Effect: v1.TaintEffectNoSchedule}, + {Key: "taint2", Value: "val2", Effect: v1.TaintEffectPreferNoSchedule}, + } + + createTestNode(ctx, t, nodeName, nil, nil, taints, true) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + taintsToRemove := []config.Taint{{Key: "taint1", Value: "val1", Effect: "NoSchedule"}} + err := k8sClient.UnQuarantineNodeAndRemoveAnnotations(ctx, nodeName, taintsToRemove, nil, []string{}, map[string]string{}) + if err != nil { + t.Fatalf("Expected no error, got %v", err) + } + + updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated node: %v", err) + } + + // Filter to test taints only (ignore automatic envtest taints) + var testTaints []v1.Taint + for _, taint := range updatedNode.Spec.Taints { + if taint.Key == "taint1" || taint.Key == "taint2" { + testTaints = append(testTaints, taint) + } + } + if len(testTaints) != 1 { + t.Errorf("Expected 1 test taint remaining, got %d", len(testTaints)) + } + if len(testTaints) > 0 && testTaints[0].Key != "taint2" { + t.Errorf("Expected taint2 to remain, got %s", testTaints[0].Key) + } +} + +func TestUnTaintAndUnCordonNode_PartialAnnotationRemoval(t *testing.T) { + ctx := context.Background() + nodeName := "test-partial-annotation-" + primitive.NewObjectID().Hex()[:8] + + annotations := map[string]string{ + "annotation1": "val1", + "annotation2": "val2", + } + labels := map[string]string{ + cordonedByLabelKey: common.ServiceName, + cordonedReasonLabelKey: "gpu-error", + cordonedTimestampLabelKey: time.Now().UTC().Format("2006-01-02T15-04-05Z"), + } + + createTestNode(ctx, t, nodeName, annotations, labels, nil, true) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + annotationsToRemove := []string{"annotation1"} + labelsMap := map[string]string{ + uncordonedByLabelKey: common.ServiceName, + uncordonedTimestampLabelKey: time.Now().UTC().Format("2006-01-02T15-04-05Z"), + } + err := k8sClient.UnQuarantineNodeAndRemoveAnnotations(ctx, nodeName, nil, annotationsToRemove, []string{cordonedByLabelKey, cordonedReasonLabelKey, cordonedTimestampLabelKey}, labelsMap) + if err != nil { + t.Fatalf("Expected no error, got %v", err) + } + + updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated node: %v", err) + } + + if _, ok := updatedNode.Annotations["annotation1"]; ok { + t.Errorf("Expected annotation1 to be removed") + } + if updatedNode.Annotations["annotation2"] != "val2" { + t.Errorf("Expected annotation2 to remain") + } + if updatedNode.Spec.Unschedulable { + t.Errorf("Expected node to be uncordoned") + } +} + +func TestTaintAndCordonNode_AlreadyTaintedCordoned(t *testing.T) { + ctx := context.Background() + nodeName := "test-already-tainted-" + primitive.NewObjectID().Hex()[:8] + + taints := []v1.Taint{ + {Key: "test-key", Value: "test-value", Effect: v1.TaintEffectNoSchedule}, + } + + createTestNode(ctx, t, nodeName, nil, nil, taints, true) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, 
metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + taintsToAdd := []config.Taint{{Key: "test-key", Value: "test-value", Effect: "NoSchedule"}} + err := k8sClient.QuarantineNodeAndSetAnnotations(ctx, nodeName, taintsToAdd, true, nil, map[string]string{}) + if err != nil { + t.Fatalf("Expected no error, got %v", err) + } + + updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated node: %v", err) + } + + // Filter to test taint only (ignore automatic envtest taints) + var testTaints []v1.Taint + for _, taint := range updatedNode.Spec.Taints { + if taint.Key == "test-key" { + testTaints = append(testTaints, taint) + } + } + if len(testTaints) != 1 { + t.Errorf("Expected 1 test taint, got %d", len(testTaints)) + } + if !updatedNode.Spec.Unschedulable { + t.Errorf("Node should remain cordoned") + } +} + +func TestUnTaintAndUnCordonNode_AlreadyUntaintedUncordoned(t *testing.T) { + ctx := context.Background() + nodeName := "test-already-untainted-" + primitive.NewObjectID().Hex()[:8] + + createTestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + err := k8sClient.UnQuarantineNodeAndRemoveAnnotations(ctx, nodeName, nil, nil, []string{}, map[string]string{}) + if err != nil { + t.Fatalf("Expected no error, got %v", err) + } + + updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated node: %v", err) + } + + // Function always uncordons, so node should be schedulable + if updatedNode.Spec.Unschedulable { + t.Errorf("Expected node to be uncordoned") + } +} + +func TestTaintAndCordonNode_InvalidTaintEffect(t *testing.T) { + ctx := context.Background() + nodeName := "test-invalid-effect-" + primitive.NewObjectID().Hex()[:8] + + createTestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + // envtest validates taint effects, so invalid effects should now return an error + taints := []config.Taint{{Key: "weird-key", Value: "weird-value", Effect: "SomeInvalidEffect"}} + err := k8sClient.QuarantineNodeAndSetAnnotations(ctx, nodeName, taints, false, nil, map[string]string{}) + if err == nil { + t.Errorf("Expected error for invalid taint effect, got nil") + } +} + +func TestTaintAndCordonNode_OverwriteAnnotation(t *testing.T) { + ctx := context.Background() + nodeName := "test-overwrite-annotation-" + primitive.NewObjectID().Hex()[:8] + + existingAnnotations := map[string]string{"existing-key": "old-value"} + + createTestNode(ctx, t, nodeName, existingAnnotations, nil, nil, false) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + annotations := map[string]string{"existing-key": "new-value"} + err := k8sClient.QuarantineNodeAndSetAnnotations(ctx, nodeName, nil, false, annotations, map[string]string{}) + if err != nil { + t.Fatalf("Expected no error, got %v", err) + } + + updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated node: %v", err) + } + + if updatedNode.Annotations["existing-key"] != "new-value" { + t.Errorf("Annotation value was not updated correctly") + } +} + +func 
TestUnTaintAndUnCordonNode_NonExistentTaintRemoval(t *testing.T) { + ctx := context.Background() + nodeName := "test-nonexistent-taint-" + primitive.NewObjectID().Hex()[:8] + + taints := []v1.Taint{ + {Key: "taint1", Value: "val1", Effect: v1.TaintEffectNoSchedule}, + } + + createTestNode(ctx, t, nodeName, nil, nil, taints, false) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + taintsToRemove := []config.Taint{{Key: "taint-nonexistent", Value: "valX", Effect: "NoSchedule"}} + err := k8sClient.UnQuarantineNodeAndRemoveAnnotations(ctx, nodeName, taintsToRemove, nil, []string{}, map[string]string{}) + if err != nil { + t.Fatalf("Expected no error, got %v", err) + } + + updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated node: %v", err) + } + + // Filter to test taint only (ignore automatic envtest taints) + var testTaints []v1.Taint + for _, taint := range updatedNode.Spec.Taints { + if taint.Key == "taint1" { + testTaints = append(testTaints, taint) + } + } + // Original taint should remain as we tried to remove a non-existent taint + if len(testTaints) != 1 { + t.Errorf("Expected 1 test taint to remain, got %d", len(testTaints)) + } +} + +func TestUnTaintAndUnCordonNode_NonExistentAnnotationRemoval(t *testing.T) { + ctx := context.Background() + nodeName := "test-nonexistent-annotation-" + primitive.NewObjectID().Hex()[:8] + + annotations := map[string]string{ + "annotation1": "val1", + } + + createTestNode(ctx, t, nodeName, annotations, nil, nil, false) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + annotationsToRemove := []string{"nonexistent-annotation"} + err := k8sClient.UnQuarantineNodeAndRemoveAnnotations(ctx, nodeName, nil, annotationsToRemove, []string{}, map[string]string{}) + if err != nil { + t.Fatalf("Expected no error, got %v", err) + } + + updatedNode, err := testClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated node: %v", err) + } + + // Original annotation should remain + if updatedNode.Annotations["annotation1"] != "val1" { + t.Errorf("Non-existent annotation removal should not affect existing annotations") + } +} + +func TestTaintAndCordonNode_EmptyTaintKeyOrValue(t *testing.T) { + ctx := context.Background() + nodeName := "test-empty-taint-" + primitive.NewObjectID().Hex()[:8] + + createTestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + // envtest validates taint keys, so empty keys should return an error + taints := []config.Taint{ + {Key: "", Value: "", Effect: "NoSchedule"}, + } + err := k8sClient.QuarantineNodeAndSetAnnotations(ctx, nodeName, taints, false, nil, map[string]string{}) + if err == nil { + t.Errorf("Expected error for empty taint key, got nil") + } +} + +func TestTaintAndCordonNode_EmptyAnnotationKey(t *testing.T) { + ctx := context.Background() + nodeName := "test-empty-annotation-key-" + primitive.NewObjectID().Hex()[:8] + + createTestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = testClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + k8sClient := setupTestClient(t) + + // envtest validates annotation keys, so empty keys 
should return an error + annotations := map[string]string{ + "": "empty-key-value", + } + err := k8sClient.QuarantineNodeAndSetAnnotations(ctx, nodeName, nil, false, annotations, map[string]string{}) + if err == nil { + t.Errorf("Expected error for empty annotation key, got nil") + } +} + +func TestTaintAndCordonNode_NonExistentNode(t *testing.T) { + ctx := context.Background() + k8sClient := setupTestClient(t) + + err := k8sClient.QuarantineNodeAndSetAnnotations(ctx, "no-such-node", nil, true, nil, map[string]string{}) + if err == nil { + t.Errorf("Expected error for non-existent node, got nil") + } +} + +func TestUnTaintAndUnCordonNode_NonExistentNode(t *testing.T) { + ctx := context.Background() + k8sClient := setupTestClient(t) + + err := k8sClient.UnQuarantineNodeAndRemoveAnnotations(ctx, "no-such-node", nil, nil, []string{}, map[string]string{}) + if err == nil { + t.Errorf("Expected error for non-existent node, got nil") + } +} diff --git a/fault-quarantine-module/pkg/informer/node_informer.go b/fault-quarantine-module/pkg/informer/node_informer.go index e50d74182..160e198be 100644 --- a/fault-quarantine-module/pkg/informer/node_informer.go +++ b/fault-quarantine-module/pkg/informer/node_informer.go @@ -15,17 +15,13 @@ package informer import ( + "context" "fmt" "log/slog" - "reflect" - "sync" "time" "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/nodeinfo" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" @@ -34,43 +30,19 @@ import ( ) const ( - // GpuNodeLabel is the label used to identify nodes with GPUs relevant to NVSentinel. - GpuNodeLabel = "nvidia.com/gpu.present" + quarantineAnnotationIndexName = "quarantineAnnotation" ) -// NodeInfoProvider defines the interface for getting node counts. -type NodeInfoProvider interface { - // GetGpuNodeCounts returns the total number of nodes with the GpuNodeLabel - // and the number of those nodes that are currently unschedulable (cordoned). - GetGpuNodeCounts() (totalGpuNodes int, cordonedNodesMap map[string]bool, err error) - // HasSynced returns true if the underlying informer cache has synced. - HasSynced() bool -} - // NodeInformer watches specific nodes and provides counts. type NodeInformer struct { - clientset kubernetes.Interface - informer cache.SharedIndexInformer - lister corelisters.NodeLister - - // Mutex protects access to the counts below - mutex sync.RWMutex - totalGpuNodes int - + clientset kubernetes.Interface + informer cache.SharedIndexInformer + lister corelisters.NodeLister informerSynced cache.InformerSynced - // workSignal is used to notify the reconciler about relevant node changes - workSignal chan struct{} - - // nodeInfo is used to store the node quarantine status - nodeInfo *nodeinfo.NodeInfo - // onQuarantinedNodeDeleted is called when a quarantined node with annotations is deleted onQuarantinedNodeDeleted func(nodeName string) - // onNodeAnnotationsChanged is called when a node's annotations change - onNodeAnnotationsChanged func(nodeName string, annotations map[string]string) - // onManualUncordon is called when a node is manually uncordoned while having FQ annotations onManualUncordon func(nodeName string) error } @@ -80,53 +52,52 @@ func (ni *NodeInformer) Lister() corelisters.NodeLister { return ni.lister } -// NewNodeInformer creates a new NodeInformer focused on nodes with the GpuNodeLabel. 
-func NewNodeInformer(clientset kubernetes.Interface, - resyncPeriod time.Duration, workSignal chan struct{}, nodeInfo *nodeinfo.NodeInfo) (*NodeInformer, error) { - // Filter nodes based on the presence of the GPU label - gpuNodeSelector := labels.Set{GpuNodeLabel: "true"}.AsSelector() +// GetInformer returns the underlying SharedIndexInformer. +func (ni *NodeInformer) GetInformer() cache.SharedIndexInformer { + return ni.informer +} - tweakListOptions := func(options *metav1.ListOptions) { - options.LabelSelector = gpuNodeSelector.String() +// NewNodeInformer creates a new NodeInformer that watches all nodes. +func NewNodeInformer(clientset kubernetes.Interface, + resyncPeriod time.Duration) (*NodeInformer, error) { + ni := &NodeInformer{ + clientset: clientset, } - // Create an informer factory filtered for the specific label - informerFactory := informers.NewSharedInformerFactoryWithOptions(clientset, resyncPeriod, - informers.WithTweakListOptions(tweakListOptions)) - nodeInformer := informerFactory.Core().V1().Nodes() + informerFactory := informers.NewSharedInformerFactory(clientset, resyncPeriod) - ni := &NodeInformer{ - clientset: clientset, - informer: nodeInformer.Informer(), - lister: nodeInformer.Lister(), - informerSynced: nodeInformer.Informer().HasSynced, - workSignal: workSignal, - nodeInfo: nodeInfo, + nodeInformerObj := informerFactory.Core().V1().Nodes() + ni.informer = nodeInformerObj.Informer() + ni.lister = nodeInformerObj.Lister() + ni.informerSynced = nodeInformerObj.Informer().HasSynced + + err := ni.informer.AddIndexers(cache.Indexers{ + quarantineAnnotationIndexName: quarantineAnnotationIndexFunc, + }) + if err != nil { + return nil, fmt.Errorf("failed to add quarantine annotation indexer: %w", err) } - // Register event handlers - _, err := ni.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + _, err = ni.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: ni.handleAddNode, - UpdateFunc: ni.handleUpdateNode, + UpdateFunc: ni.handleUpdateNodeWrapper, DeleteFunc: ni.handleDeleteNode, }) if err != nil { return nil, fmt.Errorf("failed to add event handler: %w", err) } - slog.Info("NodeInformer created, watching nodes with label", "label", GpuNodeLabel, "value", "true") + slog.Info("NodeInformer created, watching all nodes") return ni, nil } // Run starts the informer and waits for cache sync. func (ni *NodeInformer) Run(stopCh <-chan struct{}) error { - slog.Info("Starting NodeInformer", "label", GpuNodeLabel) + slog.Info("Starting NodeInformer") - // Start the informer goroutine go ni.informer.Run(stopCh) - // Wait for the initial cache synchronization slog.Info("Waiting for NodeInformer cache to sync...") if ok := cache.WaitForCacheSync(stopCh, ni.informerSynced); !ok { @@ -135,12 +106,6 @@ func (ni *NodeInformer) Run(stopCh <-chan struct{}) error { slog.Info("NodeInformer cache synced") - _, err := ni.recalculateCounts() - if err != nil { - // Log the error but allow the informer to continue running - slog.Error("Initial count calculation failed", "error", err) - } - return nil } @@ -149,103 +114,95 @@ func (ni *NodeInformer) HasSynced() bool { return ni.informerSynced() } -// GetGpuNodeCounts returns the current counts of total and unschedulable GPU nodes. -func (ni *NodeInformer) GetGpuNodeCounts() (totalGpuNodes int, cordonedNodesMap map[string]bool, err error) { - if !ni.HasSynced() { - return 0, nil, fmt.Errorf("node informer cache not synced yet") +// WaitForSync waits for the informer cache to sync with context cancellation support. 
+func (ni *NodeInformer) WaitForSync(ctx context.Context) bool { + slog.Info("Waiting for NodeInformer cache to sync...") + + if ok := cache.WaitForCacheSync(ctx.Done(), ni.informerSynced); !ok { + slog.Warn("NodeInformer cache sync failed or context cancelled") + return false } - ni.mutex.RLock() - defer ni.mutex.RUnlock() + slog.Info("NodeInformer cache synced") - return ni.totalGpuNodes, ni.nodeInfo.GetQuarantinedNodesCopy(), nil + return true } -// hasQuarantineAnnotationsChanged checks if any of the quarantine-related annotations have changed -func hasQuarantineAnnotationsChanged(oldAnnotations, newAnnotations map[string]string) bool { - // List of annotation keys we care about - quarantineKeys := []string{ - common.QuarantineHealthEventAnnotationKey, - common.QuarantineHealthEventAppliedTaintsAnnotationKey, - common.QuarantineHealthEventIsCordonedAnnotationKey, - common.QuarantinedNodeUncordonedManuallyAnnotationKey, +// quarantineAnnotationIndexFunc is the indexer function for quarantined nodes +func quarantineAnnotationIndexFunc(obj interface{}) ([]string, error) { + node, ok := obj.(*v1.Node) + if !ok { + return nil, fmt.Errorf("expected node object, got %T", obj) } - // Check if any of the quarantine annotation values have changed - for _, key := range quarantineKeys { - oldValue := oldAnnotations[key] - newValue := newAnnotations[key] - - if oldValue != newValue { - return true - } + if _, exists := node.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey]; exists { + return []string{"quarantined"}, nil } - return false + return []string{}, nil } -// getQuarantineAnnotations extracts only the quarantine-related annotations from a node's annotations -func getQuarantineAnnotations(annotations map[string]string) map[string]string { - quarantineAnnotations := make(map[string]string) +// GetNodeCounts returns the current counts of total nodes and quarantined nodes. +func (ni *NodeInformer) GetNodeCounts() (totalNodes int, quarantinedNodesMap map[string]bool, err error) { + if !ni.HasSynced() { + return 0, nil, fmt.Errorf("node informer cache not synced yet") + } + + allObjs := ni.informer.GetIndexer().List() + total := len(allObjs) - // List of annotation keys we care about - quarantineKeys := []string{ - common.QuarantineHealthEventAnnotationKey, - common.QuarantineHealthEventAppliedTaintsAnnotationKey, - common.QuarantineHealthEventIsCordonedAnnotationKey, - common.QuarantinedNodeUncordonedManuallyAnnotationKey, + quarantinedObjs, err := ni.informer.GetIndexer().ByIndex(quarantineAnnotationIndexName, "quarantined") + if err != nil { + return 0, nil, fmt.Errorf("failed to get quarantined nodes from index: %w", err) } - // Extract only the quarantine annotations - for _, key := range quarantineKeys { - if value, exists := annotations[key]; exists { - quarantineAnnotations[key] = value + quarantinedMap := make(map[string]bool, len(quarantinedObjs)) + + for _, obj := range quarantinedObjs { + if node, ok := obj.(*v1.Node); ok { + quarantinedMap[node.Name] = true } } - return quarantineAnnotations + return total, quarantinedMap, nil +} + +// GetNode retrieves a node from the informer's cache. +func (ni *NodeInformer) GetNode(name string) (*v1.Node, error) { + return ni.lister.Get(name) } -// handleAddNode recalculates counts when a node is added. +// ListNodes lists all nodes from the informer's cache. +func (ni *NodeInformer) ListNodes() ([]*v1.Node, error) { + return ni.lister.List(labels.Everything()) +} + +// handleAddNode logs when a node is added. 
func (ni *NodeInformer) handleAddNode(obj interface{}) { node, ok := obj.(*v1.Node) if !ok { slog.Error("Add event received unexpected type", "expected", "*v1.Node", - "actualType", reflect.TypeOf(obj)) + "actualType", fmt.Sprintf("%T", obj)) return } slog.Debug("Node added", "node", node.Name) +} - ni.mutex.Lock() - - ni.totalGpuNodes++ - - annotationExist := false - - if !ni.nodeInfo.GetNodeQuarantineStatusCache(node.Name) { - if _, exists := node.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey]; exists { - annotationExist = true - } - } - - // Mark as quarantined if the node is unschedulable or has the quarantine annotation - if node.Spec.Unschedulable || annotationExist { - ni.nodeInfo.MarkNodeQuarantineStatusCache(node.Name, true, annotationExist) - } - - ni.mutex.Unlock() +// handleUpdateNodeWrapper is a wrapper for handleUpdateNode that converts interface{} to *v1.Node. +func (ni *NodeInformer) handleUpdateNodeWrapper(oldObj, newObj interface{}) { + oldNode, okOld := oldObj.(*v1.Node) + newNode, okNew := newObj.(*v1.Node) - // Notify about the node's quarantine annotations (including empty ones) - // This ensures all nodes get cached, preventing API calls for clean nodes - if ni.onNodeAnnotationsChanged != nil { - quarantineAnnotations := getQuarantineAnnotations(node.Annotations) - ni.onNodeAnnotationsChanged(node.Name, quarantineAnnotations) + if !okOld || !okNew { + slog.Error("Update event: expected Node objects", + "oldType", fmt.Sprintf("%T", oldObj), "newType", fmt.Sprintf("%T", newObj)) + return } - ni.signalWork() + ni.handleUpdateNode(oldNode, newNode) } // detectAndHandleManualUncordon checks if a node was manually uncordoned and handles it @@ -263,7 +220,6 @@ func (ni *NodeInformer) detectAndHandleManualUncordon(oldNode, newNode *v1.Node) slog.Info("Detected manual uncordon of FQ-quarantined node", "node", newNode.Name) - // Call the manual uncordon handler if registered if ni.onManualUncordon != nil { if err := ni.onManualUncordon(newNode.Name); err != nil { slog.Error("Failed to handle manual uncordon for node", "node", newNode.Name, "error", err) @@ -276,76 +232,9 @@ func (ni *NodeInformer) detectAndHandleManualUncordon(oldNode, newNode *v1.Node) return true } -// handleUpdateNode recalculates counts when a node is updated. -func (ni *NodeInformer) handleUpdateNode(oldObj, newObj interface{}) { - oldNode, okOld := oldObj.(*v1.Node) - - newNode, okNew := newObj.(*v1.Node) - if !okOld || !okNew { - slog.Error("Update event received unexpected type", - "expected", "*v1.Node", - "oldType", reflect.TypeOf(oldObj), - "newType", reflect.TypeOf(newObj)) - - return - } - - // Check if quarantine annotations have changed - quarantineAnnotationsChanged := hasQuarantineAnnotationsChanged(oldNode.Annotations, newNode.Annotations) - - // Check for manual uncordon and handle it - if ni.detectAndHandleManualUncordon(oldNode, newNode) { - // Return early as the manual uncordon handler will take care of everything - return - } - - // Only process if unschedulable status changed or if the quarantine annotation is present. - // the reason it needs to be checked for quarantine annotation is because in dryrun node, - // node is not marked as unschedulable but still annotation will be present, so we need to track those nodes as well. 
- if oldNode.Spec.Unschedulable != newNode.Spec.Unschedulable || - oldNode.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey] != - newNode.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey] { - slog.Debug("Node updated", - "node", newNode.Name, - "oldUnschedulable", oldNode.Spec.Unschedulable, - "newUnschedulable", newNode.Spec.Unschedulable) - ni.updateNodeQuarantineStatus(newNode) - ni.signalWork() - } else { - slog.Debug("Node update ignored (no relevant change)", "node", newNode.Name) - } - - // Notify about quarantine annotation changes - if quarantineAnnotationsChanged && ni.onNodeAnnotationsChanged != nil { - quarantineAnnotations := getQuarantineAnnotations(newNode.Annotations) - ni.onNodeAnnotationsChanged(newNode.Name, quarantineAnnotations) - } -} - -// updateNodeQuarantineStatus updates the node's quarantine status based on its schedulability -// and returns true if the status was changed -func (ni *NodeInformer) updateNodeQuarantineStatus(node *v1.Node) bool { - ni.mutex.Lock() - defer ni.mutex.Unlock() - - nodeName := node.Name - shouldBeQuarantined := node.Spec.Unschedulable - currentlyQuarantined := ni.nodeInfo.GetNodeQuarantineStatusCache(nodeName) - - // Only update if there's a difference between current and desired state - if currentlyQuarantined != shouldBeQuarantined { - annotationExist := false - - if _, exists := node.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey]; exists { - annotationExist = true - } - - ni.nodeInfo.MarkNodeQuarantineStatusCache(nodeName, shouldBeQuarantined, annotationExist) - - return true - } - - return false +// handleUpdateNode detects and handles manual uncordon of quarantined nodes. +func (ni *NodeInformer) handleUpdateNode(oldNode, newNode *v1.Node) { + ni.detectAndHandleManualUncordon(oldNode, newNode) } // SetOnQuarantinedNodeDeletedCallback sets the callback function for when a quarantined node is deleted @@ -353,27 +242,20 @@ func (ni *NodeInformer) SetOnQuarantinedNodeDeletedCallback(callback func(nodeNa ni.onQuarantinedNodeDeleted = callback } -// SetOnNodeAnnotationsChangedCallback sets the callback function for when a node's annotations change -func (ni *NodeInformer) SetOnNodeAnnotationsChangedCallback(callback func(nodeName string, - annotations map[string]string)) { - ni.onNodeAnnotationsChanged = callback -} - // SetOnManualUncordonCallback sets the callback function for when a node is manually uncordoned func (ni *NodeInformer) SetOnManualUncordonCallback(callback func(nodeName string) error) { ni.onManualUncordon = callback } -// handleDeleteNode recalculates counts when a node is deleted. +// handleDeleteNode handles node deletion events. 
func (ni *NodeInformer) handleDeleteNode(obj interface{}) { node, ok := obj.(*v1.Node) if !ok { - // Handle deletion notifications potentially wrapped in DeletedFinalStateUnknown tombstone, ok := obj.(cache.DeletedFinalStateUnknown) if !ok { slog.Error("Delete event received unexpected type", "expected", "*v1.Node or DeletedFinalStateUnknown", - "actualType", reflect.TypeOf(obj)) + "actualType", fmt.Sprintf("%T", obj)) return } @@ -382,7 +264,7 @@ func (ni *NodeInformer) handleDeleteNode(obj interface{}) { if !ok { slog.Error("Delete event tombstone contained unexpected type", "expected", "*v1.Node", - "actualType", reflect.TypeOf(tombstone.Obj)) + "actualType", fmt.Sprintf("%T", tombstone.Obj)) return } @@ -390,96 +272,11 @@ func (ni *NodeInformer) handleDeleteNode(obj interface{}) { slog.Info("Node deleted", "node", node.Name) - ni.mutex.Lock() - - // Check if the node was quarantined and had the quarantine annotation - hadQuarantineAnnotation := false - - if ni.nodeInfo.GetNodeQuarantineStatusCache(node.Name) { - if _, exists := node.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey]; exists { - hadQuarantineAnnotation = true - } - - // Update the cache and delete the node name from the map if the annotation is present - ni.nodeInfo.MarkNodeQuarantineStatusCache(node.Name, false, false) - } - - // handle a case where a node is cordoned but not by nvsentinel, then if its entry is there in the cache, - // we need to remove it - if node.Spec.Unschedulable { - ni.nodeInfo.MarkNodeQuarantineStatusCache(node.Name, false, false) - } - - ni.totalGpuNodes-- - ni.mutex.Unlock() + _, hadQuarantineAnnotation := node.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey] // If the node was quarantined and had the annotation, call the callback so that // currentQuarantinedNodes metric is decremented if hadQuarantineAnnotation && ni.onQuarantinedNodeDeleted != nil { ni.onQuarantinedNodeDeleted(node.Name) } - - // Notify about node deletion to clear from annotations cache - if ni.onNodeAnnotationsChanged != nil { - // Pass nil to indicate the node has been deleted - ni.onNodeAnnotationsChanged(node.Name, nil) - } - - ni.signalWork() -} - -// recalculateCounts lists all relevant nodes from the cache and updates the counts. -// It returns true if the counts changed, false otherwise. 
-func (ni *NodeInformer) recalculateCounts() (bool, error) { - // Use List with Everything selector as the lister is already filtered by the factory - nodes, err := ni.lister.List(labels.Everything()) - if err != nil { - return false, fmt.Errorf("failed to list nodes from informer cache: %w", err) - } - - total := 0 - unschedulable := 0 - - for _, node := range nodes { - // Double-check the label, although the informer should only list matching nodes - if _, exists := node.Labels[GpuNodeLabel]; exists { - total++ - - if node.Spec.Unschedulable { - ni.nodeInfo.MarkNodeQuarantineStatusCache(node.Name, true, false) - - unschedulable++ - } - } else { - slog.Warn("Node %s found in informer cache despite missing label %s", node.Name, GpuNodeLabel) - } - } - - ni.mutex.Lock() - quarantinedCount := ni.nodeInfo.GetQuarantinedNodesCount() - changed := ni.totalGpuNodes != total || quarantinedCount != unschedulable - ni.totalGpuNodes = total - ni.mutex.Unlock() - - if changed { - slog.Debug("Node counts updated", "totalGpuNodes", total, "unschedulableGpuNodes", unschedulable) - } else { - slog.Debug("Node counts recalculated, no change", "totalGpuNodes", total, "unschedulableGpuNodes", unschedulable) - } - - return changed, nil -} - -// signalWork sends a non-blocking signal to the reconciler's work channel. -func (ni *NodeInformer) signalWork() { - if ni.workSignal == nil { - slog.Error("No channel configured for node informer", "nodeInformer", ni) - return // No channel configured - } - select { - case ni.workSignal <- struct{}{}: - slog.Debug("Signalled work channel due to node change.") - default: - slog.Debug("Work channel already signalled, skipping signal for node change.") - } } diff --git a/fault-quarantine-module/pkg/informer/node_informer_test.go b/fault-quarantine-module/pkg/informer/node_informer_test.go deleted file mode 100644 index 31d80c543..000000000 --- a/fault-quarantine-module/pkg/informer/node_informer_test.go +++ /dev/null @@ -1,954 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package informer - -import ( - "context" - "fmt" - "sync" - "testing" - "time" - - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/nodeinfo" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" - "k8s.io/client-go/tools/cache" -) - -// Helper function to create a node object -func newNode(name string, labels map[string]string, unschedulable bool) *v1.Node { - return &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Labels: labels, - }, - Spec: v1.NodeSpec{ - Unschedulable: unschedulable, - }, - } -} - -// Helper function to create a GPU node object -func newGpuNode(name string, unschedulable bool) *v1.Node { - return newNode(name, map[string]string{GpuNodeLabel: "true"}, unschedulable) -} - -func TestNewNodeInformer(t *testing.T) { - clientset := fake.NewSimpleClientset() - workSignal := make(chan struct{}, 1) // Buffered channel - nodeInfo := nodeinfo.NewNodeInfo(workSignal) - - ni, err := NewNodeInformer(clientset, 0, workSignal, nodeInfo) // 0 resync period for tests - - if err != nil { - t.Fatalf("NewNodeInformer failed: %v", err) - } - if ni == nil { - t.Fatal("NewNodeInformer returned nil informer") - } - if ni.clientset != clientset { - t.Error("Clientset not stored correctly") - } - if ni.informer == nil { - t.Error("Informer not created") - } - if ni.lister == nil { - t.Error("Lister not created") - } - if ni.informerSynced == nil { - t.Error("InformerSynced function not set") - } - if ni.workSignal != workSignal { - t.Error("WorkSignal channel not stored correctly") - } -} - -// waitForSync waits for the informer cache to sync or times out. -func waitForSync(t *testing.T, stopCh chan struct{}, informerSynced cache.InformerSynced) { - t.Helper() - syncCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) // Timeout for sync - defer cancel() - - if !cache.WaitForCacheSync(syncCtx.Done(), informerSynced) { - t.Fatal("Timed out waiting for caches to sync") - } -} - -// safeReceiveSignal waits for a signal with a timeout to prevent test hangs -func safeReceiveSignal(t *testing.T, workSignal chan struct{}, expectSignal bool) bool { - t.Helper() - - select { - case <-workSignal: - if !expectSignal { - t.Log("Received unexpected signal") - } - return true - case <-time.After(500 * time.Millisecond): - if expectSignal { - t.Error("Expected signal but none received within timeout") - } - return false - } -} - -func TestNodeInformer_RunAndSync(t *testing.T) { - clientset := fake.NewSimpleClientset(newGpuNode("gpu-node-1", false)) - workSignal := make(chan struct{}, 1) - nodeInfo := nodeinfo.NewNodeInfo(workSignal) - stopCh := make(chan struct{}) - defer close(stopCh) - - ni, err := NewNodeInformer(clientset, 0, workSignal, nodeInfo) - if err != nil { - t.Fatalf("NewNodeInformer failed: %v", err) - } - - var runErr error // Variable to store error from the Run goroutine - var runErrMu sync.Mutex // Mutex to protect runErr - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - localErr := ni.Run(stopCh) // Use a local variable inside goroutine - if localErr != nil { - runErrMu.Lock() - runErr = localErr // Assign protected by mutex - runErrMu.Unlock() - } - }() - - // Wait for sync completion which happens inside Run - waitForSync(t, stopCh, ni.informerSynced) - - if !ni.HasSynced() { - t.Error("Expected HasSynced to be true after Run completed sync") - } - - // Check initial counts after sync - total, 
unschedulableMap, err := ni.GetGpuNodeCounts() - if err != nil { - t.Errorf("GetGpuNodeCounts failed after sync: %v", err) - } - if total != 1 { - t.Errorf("Expected 1 total GPU node after sync, got %d", total) - } - if len(unschedulableMap) != 0 { - t.Errorf("Expected 0 unschedulable GPU nodes after sync, got %d", len(unschedulableMap)) - } - - // Stop the informer and wait for Run goroutine to exit - // The deferred close(stopCh) will signal the Run goroutine to stop. - // wg.Wait() ensures we wait for the Run goroutine to finish processing the stop signal. - wg.Wait() - - runErrMu.Lock() // Lock before reading runErr - finalRunErr := runErr // Read protected by mutex - runErrMu.Unlock() // Unlock after reading - - if finalRunErr != nil { - // We expect nil error on clean shutdown, potentially error if sync failed before shutdown - // Allow the specific sync error in case waitForSync timed out but Run exited cleanly later - if finalRunErr.Error() != "failed to wait for caches to sync" { - t.Errorf("ni.Run returned unexpected error: %v", finalRunErr) - } - } -} - -func TestNodeInformer_GetGpuNodeCounts_NotSynced(t *testing.T) { - clientset := fake.NewSimpleClientset() - workSignal := make(chan struct{}, 1) - nodeInfo := nodeinfo.NewNodeInfo(workSignal) - - ni, err := NewNodeInformer(clientset, 0, workSignal, nodeInfo) - if err != nil { - t.Fatalf("NewNodeInformer failed: %v", err) - } - - // Don't run the informer, so it won't be synced - _, _, err = ni.GetGpuNodeCounts() - if err == nil { - t.Error("Expected error when getting counts before cache sync, got nil") - } else if err.Error() != "node informer cache not synced yet" { - t.Errorf("Expected specific 'not synced' error, got: %v", err) - } -} - -func TestNodeInformer_EventHandlers(t *testing.T) { - clientset := fake.NewSimpleClientset() // Start with no nodes - workSignal := make(chan struct{}, 10) - stopCh := make(chan struct{}) - defer close(stopCh) - - nodeInfo := nodeinfo.NewNodeInfo(workSignal) - ni, err := NewNodeInformer(clientset, 0, workSignal, nodeInfo) - if err != nil { - t.Fatalf("NewNodeInformer failed: %v", err) - } - - // Need access to the informer's store to add/update/delete objects directly - store := ni.informer.GetStore() - - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - // Run blocks until sync or stopCh is closed - _ = ni.Run(stopCh) - }() - - // Wait for initial sync - waitForSync(t, stopCh, ni.informerSynced) - t.Log("Initial sync complete") - - // Drain any initial signals from cache sync - drainSignals := func() { - for { - select { - case <-workSignal: - // Keep draining - default: - return - } - } - } - drainSignals() - - // Check initial state (0 nodes) - total, unschedulableMap, err := ni.GetGpuNodeCounts() - if err != nil { - t.Fatalf("GetGpuNodeCounts failed after initial sync: %v", err) - } - if total != 0 || len(unschedulableMap) != 0 { - t.Fatalf("Expected 0 nodes initially, got total=%d, unschedulable=%d", total, len(unschedulableMap)) - } - - // --- Test Add --- - node1 := newGpuNode("gpu-node-1", false) - t.Logf("Adding node: %s", node1.Name) - err = store.Add(node1) - if err != nil { - t.Fatalf("Failed to add node1 to store: %v", err) - } - ni.handleAddNode(node1) // Manually trigger handler - safeReceiveSignal(t, workSignal, true) // Expect signal, with timeout - drainSignals() // Drain any extra signals - total, unschedulableMap, err = ni.GetGpuNodeCounts() - if err != nil || total != 1 || len(unschedulableMap) != 0 { - t.Errorf("After adding node1: expected total=1, 
unschedulable=0, err=nil; got total=%d, unschedulable=%d, err=%v", total, len(unschedulableMap), err) - } - // --- Test Update (Cordon) --- - node1Cordoned := newGpuNode("gpu-node-1", true) // Same node, now unschedulable - t.Logf("Updating node: %s (cordon)", node1.Name) - err = store.Update(node1Cordoned) - if err != nil { - t.Fatalf("Failed to update node1 in store: %v", err) - } - ni.handleUpdateNode(node1, node1Cordoned) // Manually trigger handler - safeReceiveSignal(t, workSignal, true) // Expect signal, with timeout - drainSignals() - total, unschedulableMap, err = ni.GetGpuNodeCounts() - if err != nil || total != 1 || len(unschedulableMap) != 1 { - t.Errorf("After cordoning node1: expected total=1, unschedulable=1, err=nil; got total=%d, unschedulable=%d, err=%v", total, len(unschedulableMap), err) - } - - // --- Test Update (No relevant change) --- - node1CordonedUpdated := node1Cordoned.DeepCopy() - node1CordonedUpdated.Annotations = map[string]string{"new": "annotation"} // Change something irrelevant - t.Logf("Updating node: %s (irrelevant change)", node1Cordoned.Name) - err = store.Update(node1CordonedUpdated) - if err != nil { - t.Fatalf("Failed to update node1 again in store: %v", err) - } - ni.handleUpdateNode(node1Cordoned, node1CordonedUpdated) // Manually trigger handler - drainSignals() - // No signal expected for irrelevant updates - safeReceiveSignal(t, workSignal, false) // Don't expect signal, with timeout - total, unschedulableMap, err = ni.GetGpuNodeCounts() - if err != nil || total != 1 || len(unschedulableMap) != 1 { - t.Errorf("After irrelevant update node1: expected total=1, unschedulable=1, err=nil; got total=%d, unschedulable=%d, err=%v", total, len(unschedulableMap), err) - } - - // --- Test Delete --- - t.Logf("Deleting node: %s", node1CordonedUpdated.Name) - err = store.Delete(node1CordonedUpdated) - if err != nil { - t.Fatalf("Failed to delete node1 from store: %v", err) - } - ni.handleDeleteNode(node1CordonedUpdated) // Manually trigger handler - safeReceiveSignal(t, workSignal, true) // Expect signal, with timeout - drainSignals() - - total, unschedulableMap, err = ni.GetGpuNodeCounts() - if err != nil || total != 0 || len(unschedulableMap) != 0 { - t.Errorf("After deleting node1: expected total=0, unschedulable=0, err=nil; got total=%d, unschedulable=%d, err=%v", total, len(unschedulableMap), err) - } - - // --- Test Delete (Tombstone) --- - node2 := newGpuNode("gpu-node-2", true) - t.Logf("Adding node: %s", node2.Name) - err = store.Add(node2) - if err != nil { - t.Fatalf("Failed to add node2 to store: %v", err) - } - ni.handleAddNode(node2) // Manually trigger handler - safeReceiveSignal(t, workSignal, true) // Expect signal, with timeout - drainSignals() - - // Verify node2 was added - total, unschedulableMap, err = ni.GetGpuNodeCounts() - if err != nil || total != 1 || len(unschedulableMap) != 1 { - t.Errorf("After adding node2: expected total=1, unschedulable=1, err=nil; got total=%d, unschedulable=%d, err=%v", total, len(unschedulableMap), err) - } - - t.Logf("Deleting node with tombstone: %s", node2.Name) - err = store.Delete(node2) - if err != nil { - t.Fatalf("Failed to delete node2 from store: %v", err) - } - tombstone := cache.DeletedFinalStateUnknown{Key: "default/gpu-node-2", Obj: node2} - ni.handleDeleteNode(tombstone) // Trigger handler with tombstone - safeReceiveSignal(t, workSignal, true) // Expect signal, with timeout - drainSignals() - total, unschedulableMap, err = ni.GetGpuNodeCounts() - if err != nil || total != 0 || 
len(unschedulableMap) != 0 { - t.Errorf("After deleting node2 via tombstone: expected total=0, unschedulable=0, err=nil; got total=%d, unschedulable=%d, err=%v", total, len(unschedulableMap), err) - } - - // The deferred close(stopCh) will signal the Run goroutine to stop. - wg.Wait() -} - -func TestNodeInformer_RecalculateCounts(t *testing.T) { - clientset := fake.NewSimpleClientset() - workSignal := make(chan struct{}, 1) - stopCh := make(chan struct{}) - defer close(stopCh) - - // Pre-populate nodes directly (won't trigger handlers) - node1 := newGpuNode("gpu-node-1", true) - node2 := newGpuNode("gpu-node-2", true) - node3 := newGpuNode("gpu-node-3", false) - - nodeInfo := nodeinfo.NewNodeInfo(workSignal) - ni, err := NewNodeInformer(clientset, 0, workSignal, nodeInfo) - if err != nil { - t.Fatalf("NewNodeInformer failed: %v", err) - } - - // Manually add nodes to the informer's store - ni.informer.GetStore().Add(node1) - ni.informer.GetStore().Add(node2) - ni.informer.GetStore().Add(node3) - - // Manually mark the nodes as quarantined in the nodeInfo cache - // since the handleAddNode isn't being called - nodeInfo.MarkNodeQuarantineStatusCache("gpu-node-1", true, true) - nodeInfo.MarkNodeQuarantineStatusCache("gpu-node-2", true, true) - nodeInfo.MarkNodeQuarantineStatusCache("gpu-node-3", false, true) - - // Run recalculate directly - _, err = ni.recalculateCounts() // Assign both bool and error, ignore bool - if err != nil { - t.Fatalf("recalculateCounts failed: %v", err) - } - - // Check internal counts directly - ni.mutex.RLock() - total := ni.totalGpuNodes - ni.mutex.RUnlock() - unschedulable := ni.nodeInfo.GetQuarantinedNodesCount() - - if total != 3 { - t.Errorf("Expected totalGpuNodes=3, got %d", total) - } - if unschedulable != 2 { - t.Errorf("Expected unschedulableGpuNodes=2, got %d", unschedulable) - } -} - -func TestNodeInformer_SignalWork(t *testing.T) { - // Test signal sent - workSignal := make(chan struct{}, 1) - ni := &NodeInformer{workSignal: workSignal} - ni.signalWork() - select { - case <-workSignal: - // Expected path - case <-time.After(100 * time.Millisecond): - t.Error("Timed out waiting for work signal") - } - - // Test non-blocking behavior (channel full) - ni.signalWork() // Should not block - select { - case workSignal <- struct{}{}: - t.Error("Should not have been able to send to already full channel") - default: - // Expected path, signal was dropped - } - - // Test nil channel - niNil := &NodeInformer{workSignal: nil} - // Should not panic - niNil.signalWork() -} - -func TestNodeInformer_OnQuarantinedNodeDeletedCallback(t *testing.T) { - clientset := fake.NewSimpleClientset() - workSignal := make(chan struct{}, 1) - nodeInfo := nodeinfo.NewNodeInfo(workSignal) - - ni, err := NewNodeInformer(clientset, 0, workSignal, nodeInfo) - if err != nil { - t.Fatalf("NewNodeInformer failed: %v", err) - } - - // Set up callback to track calls - var callbackCalled bool - var callbackNodeName string - ni.SetOnQuarantinedNodeDeletedCallback(func(nodeName string) { - callbackCalled = true - callbackNodeName = nodeName - }) - - // Test 1: Delete a quarantined node with annotation - callback should be called - node1 := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-node-1", - Labels: map[string]string{GpuNodeLabel: "true"}, - Annotations: map[string]string{ - common.QuarantineHealthEventIsCordonedAnnotationKey: common.QuarantineHealthEventIsCordonedAnnotationValueTrue, - }, - }, - Spec: v1.NodeSpec{ - Unschedulable: true, - }, - } - - // Simulate the node being in 
quarantine cache - nodeInfo.MarkNodeQuarantineStatusCache("gpu-node-1", true, true) - - // Handle delete - ni.handleDeleteNode(node1) - - if !callbackCalled { - t.Error("Expected callback to be called for quarantined node with annotation") - } - if callbackNodeName != "gpu-node-1" { - t.Errorf("Expected callback node name to be gpu-node-1, got %s", callbackNodeName) - } - - // Test 2: Delete a quarantined node without annotation - callback should NOT be called - callbackCalled = false - callbackNodeName = "" - - node2 := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-node-2", - Labels: map[string]string{GpuNodeLabel: "true"}, - }, - Spec: v1.NodeSpec{ - Unschedulable: true, - }, - } - - // Simulate the node being in quarantine cache but without annotation - nodeInfo.MarkNodeQuarantineStatusCache("gpu-node-2", true, false) - - // Handle delete - ni.handleDeleteNode(node2) - - // Test 3: Delete a non-quarantined node - callback should NOT be called - node3 := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-node-3", - Labels: map[string]string{GpuNodeLabel: "true"}, - }, - Spec: v1.NodeSpec{ - Unschedulable: false, - }, - } - - // Handle delete (node not in quarantine cache) - ni.handleDeleteNode(node3) - - if callbackCalled { - t.Error("Expected callback NOT to be called for non-quarantined node") - } -} - -func TestNodeInformer_OnNodeAnnotationsChangedCallback(t *testing.T) { - clientset := fake.NewSimpleClientset() - workSignal := make(chan struct{}, 1) - nodeInfo := nodeinfo.NewNodeInfo(workSignal) - - ni, err := NewNodeInformer(clientset, 0, workSignal, nodeInfo) - if err != nil { - t.Fatalf("NewNodeInformer failed: %v", err) - } - - // Set up callback to track calls - var callbackCalls []struct { - nodeName string - annotations map[string]string - } - var mu sync.Mutex - - ni.SetOnNodeAnnotationsChangedCallback(func(nodeName string, annotations map[string]string) { - mu.Lock() - defer mu.Unlock() - // Make a copy of annotations to avoid race conditions - annotationsCopy := make(map[string]string) - for k, v := range annotations { - annotationsCopy[k] = v - } - callbackCalls = append(callbackCalls, struct { - nodeName string - annotations map[string]string - }{nodeName: nodeName, annotations: annotationsCopy}) - }) - - // Test 1: Add a node with quarantine annotations - callback should be called - node1 := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-node-1", - Labels: map[string]string{GpuNodeLabel: "true"}, - Annotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event-data", - "other-annotation": "should-be-ignored", - }, - }, - } - - ni.handleAddNode(node1) - - // Check callback was called - mu.Lock() - if len(callbackCalls) != 1 { - t.Errorf("Expected 1 callback call after add, got %d", len(callbackCalls)) - } else { - call := callbackCalls[0] - if call.nodeName != "gpu-node-1" { - t.Errorf("Expected node name gpu-node-1, got %s", call.nodeName) - } - if len(call.annotations) != 1 { - t.Errorf("Expected 1 annotation, got %d", len(call.annotations)) - } - if call.annotations[common.QuarantineHealthEventAnnotationKey] != "event-data" { - t.Errorf("Expected quarantine annotation value 'event-data', got %s", - call.annotations[common.QuarantineHealthEventAnnotationKey]) - } - if _, exists := call.annotations["other-annotation"]; exists { - t.Error("Non-quarantine annotation should not be included") - } - } - mu.Unlock() - - // Test 2: Add a node without quarantine annotations - callback should be called with empty map - node2 := 
&v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-node-2", - Labels: map[string]string{GpuNodeLabel: "true"}, - Annotations: map[string]string{ - "other-annotation": "value", - }, - }, - } - - ni.handleAddNode(node2) - - mu.Lock() - if len(callbackCalls) != 2 { - t.Errorf("Expected 2 callback calls (including empty annotations), got %d", len(callbackCalls)) - } else { - call := callbackCalls[1] - if call.nodeName != "gpu-node-2" { - t.Errorf("Expected node name gpu-node-2, got %s", call.nodeName) - } - if len(call.annotations) != 0 { - t.Errorf("Expected 0 quarantine annotations for clean node, got %d", len(call.annotations)) - } - } - mu.Unlock() - - // Test 3: Update node with changed quarantine annotations - callback should be called - node1Updated := node1.DeepCopy() - node1Updated.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey] = common.QuarantineHealthEventIsCordonedAnnotationValueTrue - node1Updated.Annotations["other-annotation"] = "new-value" // This change should be ignored - - ni.handleUpdateNode(node1, node1Updated) - - mu.Lock() - if len(callbackCalls) != 3 { - t.Errorf("Expected 3 callback calls after update, got %d", len(callbackCalls)) - } else { - call := callbackCalls[2] - if call.nodeName != "gpu-node-1" { - t.Errorf("Expected node name gpu-node-1, got %s", call.nodeName) - } - if len(call.annotations) != 2 { - t.Errorf("Expected 2 annotations, got %d", len(call.annotations)) - } - if call.annotations[common.QuarantineHealthEventAnnotationKey] != "event-data" { - t.Errorf("Expected quarantine annotation value 'event-data', got %s", - call.annotations[common.QuarantineHealthEventAnnotationKey]) - } - if call.annotations[common.QuarantineHealthEventIsCordonedAnnotationKey] != common.QuarantineHealthEventIsCordonedAnnotationValueTrue { - t.Errorf("Expected cordoned annotation value 'True', got %s", - call.annotations[common.QuarantineHealthEventIsCordonedAnnotationKey]) - } - } - mu.Unlock() - - // Test 4: Update node with only non-quarantine annotation changes - callback should NOT be called - node1UpdatedAgain := node1Updated.DeepCopy() - node1UpdatedAgain.Annotations["other-annotation"] = "another-value" - node1UpdatedAgain.Spec.Unschedulable = true // Change something else - - ni.handleUpdateNode(node1Updated, node1UpdatedAgain) - - mu.Lock() - if len(callbackCalls) != 3 { - t.Errorf("Expected still 3 callback calls (no new call for non-quarantine changes), got %d", len(callbackCalls)) - } - mu.Unlock() - - // Test 5: Delete node - callback should be called with nil annotations - ni.handleDeleteNode(node1UpdatedAgain) - - mu.Lock() - if len(callbackCalls) != 4 { - t.Errorf("Expected 4 callback calls after delete, got %d", len(callbackCalls)) - } else { - call := callbackCalls[3] - if call.nodeName != "gpu-node-1" { - t.Errorf("Expected node name gpu-node-1, got %s", call.nodeName) - } - if len(call.annotations) > 0 { - t.Errorf("Expected nil or empty annotations for deleted node, got %v", call.annotations) - } - } - mu.Unlock() - - // Test 6: Update to remove quarantine annotations - callback should be called - node3 := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-node-3", - Labels: map[string]string{GpuNodeLabel: "true"}, - Annotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event-data", - }, - }, - } - node3Updated := node3.DeepCopy() - delete(node3Updated.Annotations, common.QuarantineHealthEventAnnotationKey) - - // Reset callback calls - mu.Lock() - callbackCalls = callbackCalls[:0] - mu.Unlock() - - 
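These handler tests revolve around two helpers, getQuarantineAnnotations and hasQuarantineAnnotationsChanged, whose implementations are not shown in this hunk. A minimal sketch of the behaviour the surrounding tests describe, assuming the key list below stands in for the real constants in the common package:

// Sketch only; not the package's actual implementation.
package informer

import "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common"

// Assumed key set for illustration; the real constants live in the common package.
var quarantineAnnotationKeys = []string{
	common.QuarantineHealthEventAnnotationKey,
	common.QuarantineHealthEventIsCordonedAnnotationKey,
	common.QuarantineHealthEventAppliedTaintsAnnotationKey,
}

// getQuarantineAnnotationsSketch keeps only quarantine-specific annotations and
// always returns a non-nil map, as the table-driven tests expect.
func getQuarantineAnnotationsSketch(all map[string]string) map[string]string {
	out := map[string]string{}
	for _, key := range quarantineAnnotationKeys {
		if v, ok := all[key]; ok {
			out[key] = v
		}
	}
	return out
}

// hasQuarantineAnnotationsChangedSketch reports a change only when a quarantine key
// is added, removed, or modified. Lookups on nil maps are safe, so nil and empty
// annotation maps compare as equal.
func hasQuarantineAnnotationsChangedSketch(oldAnn, newAnn map[string]string) bool {
	for _, key := range quarantineAnnotationKeys {
		oldVal, oldOK := oldAnn[key]
		newVal, newOK := newAnn[key]
		if oldOK != newOK || oldVal != newVal {
			return true
		}
	}
	return false
}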
ni.handleAddNode(node3) - ni.handleUpdateNode(node3, node3Updated) - - mu.Lock() - if len(callbackCalls) != 2 { - t.Errorf("Expected 2 callback calls (add + update with annotation removal), got %d", len(callbackCalls)) - } else { - // First call should have the annotation - if len(callbackCalls[0].annotations) != 1 { - t.Errorf("Expected 1 annotation in first call, got %d", len(callbackCalls[0].annotations)) - } - // Second call should have empty annotations (all quarantine annotations removed) - if len(callbackCalls[1].annotations) != 0 { - t.Errorf("Expected 0 annotations in second call, got %d", len(callbackCalls[1].annotations)) - } - } - mu.Unlock() -} - -// Test edge cases for quarantine annotation tracking -func TestHasQuarantineAnnotationsChanged_EdgeCases(t *testing.T) { - tests := []struct { - name string - oldAnnotations map[string]string - newAnnotations map[string]string - expectChanged bool - }{ - { - name: "nil to nil annotations", - oldAnnotations: nil, - newAnnotations: nil, - expectChanged: false, - }, - { - name: "nil to empty annotations", - oldAnnotations: nil, - newAnnotations: map[string]string{}, - expectChanged: false, - }, - { - name: "empty to nil annotations", - oldAnnotations: map[string]string{}, - newAnnotations: nil, - expectChanged: false, - }, - { - name: "nil to quarantine annotations", - oldAnnotations: nil, - newAnnotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event-data", - }, - expectChanged: true, - }, - { - name: "quarantine annotations to nil", - oldAnnotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event-data", - }, - newAnnotations: nil, - expectChanged: true, - }, - { - name: "only non-quarantine annotations change", - oldAnnotations: map[string]string{ - "other-annotation": "value1", - common.QuarantineHealthEventAnnotationKey: "event-data", - }, - newAnnotations: map[string]string{ - "other-annotation": "value2", - common.QuarantineHealthEventAnnotationKey: "event-data", - }, - expectChanged: false, - }, - { - name: "quarantine annotation value changes", - oldAnnotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event-data-1", - }, - newAnnotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event-data-2", - }, - expectChanged: true, - }, - { - name: "multiple quarantine annotations, one changes", - oldAnnotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event-data", - common.QuarantineHealthEventIsCordonedAnnotationKey: "True", - common.QuarantineHealthEventAppliedTaintsAnnotationKey: "[taint1]", - }, - newAnnotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event-data", - common.QuarantineHealthEventIsCordonedAnnotationKey: "False", // changed - common.QuarantineHealthEventAppliedTaintsAnnotationKey: "[taint1]", - }, - expectChanged: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - changed := hasQuarantineAnnotationsChanged(tt.oldAnnotations, tt.newAnnotations) - if changed != tt.expectChanged { - t.Errorf("hasQuarantineAnnotationsChanged() = %v, want %v", changed, tt.expectChanged) - } - }) - } -} - -// Test race conditions in annotation change callbacks -func TestNodeInformer_AnnotationCallbackRaceCondition(t *testing.T) { - clientset := fake.NewSimpleClientset() - workSignal := make(chan struct{}, 100) // larger buffer for concurrent events - nodeInfo := nodeinfo.NewNodeInfo(workSignal) - - ni, err := NewNodeInformer(clientset, 0, workSignal, nodeInfo) 
- if err != nil { - t.Fatalf("NewNodeInformer failed: %v", err) - } - - // Track callback invocations with thread-safe access - var mu sync.Mutex - callbackInvocations := make(map[string][]map[string]string) - - ni.SetOnNodeAnnotationsChangedCallback(func(nodeName string, annotations map[string]string) { - mu.Lock() - defer mu.Unlock() - - // Deep copy annotations to avoid race conditions - annotationsCopy := make(map[string]string) - for k, v := range annotations { - annotationsCopy[k] = v - } - - callbackInvocations[nodeName] = append(callbackInvocations[nodeName], annotationsCopy) - }) - - // Create multiple goroutines that concurrently update nodes - var wg sync.WaitGroup - nodeCount := 10 - updateCount := 5 - - for i := 0; i < nodeCount; i++ { - nodeName := fmt.Sprintf("node-%d", i) - - // Initial node - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{GpuNodeLabel: "true"}, - }, - } - - wg.Add(1) - go func(n *v1.Node) { - defer wg.Done() - - // Add node - ni.handleAddNode(n) - - // Perform multiple concurrent updates - for j := 0; j < updateCount; j++ { - oldNode := n.DeepCopy() - newNode := n.DeepCopy() - - // Alternate between adding and removing quarantine annotations - if j%2 == 0 { - newNode.Annotations = map[string]string{ - common.QuarantineHealthEventAnnotationKey: fmt.Sprintf("event-%d", j), - } - } else { - newNode.Annotations = map[string]string{} - } - - ni.handleUpdateNode(oldNode, newNode) - n = newNode - } - - // Delete node - ni.handleDeleteNode(n) - }(node) - } - - wg.Wait() - - // Verify callback was called correct number of times for each node - mu.Lock() - defer mu.Unlock() - - for i := 0; i < nodeCount; i++ { - nodeName := fmt.Sprintf("node-%d", i) - invocations := callbackInvocations[nodeName] - - // Expected: 1 add + updateCount updates + 1 delete - expectedCallbacks := 1 + updateCount + 1 - - if len(invocations) != expectedCallbacks { - t.Errorf("Node %s: expected %d callbacks, got %d", nodeName, expectedCallbacks, len(invocations)) - } - - // Verify last invocation is for deletion (nil or empty map) - lastInvocation := invocations[len(invocations)-1] - if len(lastInvocation) > 0 { - t.Errorf("Node %s: expected last callback to be for deletion (empty annotations), got %v", - nodeName, lastInvocation) - } - } -} - -// Test that getQuarantineAnnotations filters correctly -func TestGetQuarantineAnnotations(t *testing.T) { - tests := []struct { - name string - allAnnotations map[string]string - expectedResult map[string]string - }{ - { - name: "nil annotations", - allAnnotations: nil, - expectedResult: map[string]string{}, - }, - { - name: "empty annotations", - allAnnotations: map[string]string{}, - expectedResult: map[string]string{}, - }, - { - name: "only quarantine annotations", - allAnnotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event1", - common.QuarantineHealthEventIsCordonedAnnotationKey: "True", - common.QuarantineHealthEventAppliedTaintsAnnotationKey: "[taint]", - }, - expectedResult: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event1", - common.QuarantineHealthEventIsCordonedAnnotationKey: "True", - common.QuarantineHealthEventAppliedTaintsAnnotationKey: "[taint]", - }, - }, - { - name: "mixed annotations", - allAnnotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event1", - "kubernetes.io/some-annotation": "value", - "custom-annotation": "custom-value", - common.QuarantineHealthEventIsCordonedAnnotationKey: "True", - }, - 
expectedResult: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event1", - common.QuarantineHealthEventIsCordonedAnnotationKey: "True", - }, - }, - { - name: "no quarantine annotations", - allAnnotations: map[string]string{ - "kubernetes.io/annotation1": "value1", - "custom-annotation": "value2", - }, - expectedResult: map[string]string{}, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := getQuarantineAnnotations(tt.allAnnotations) - - if len(result) != len(tt.expectedResult) { - t.Errorf("Expected %d quarantine annotations, got %d", - len(tt.expectedResult), len(result)) - } - - for key, expectedValue := range tt.expectedResult { - if actualValue, exists := result[key]; !exists { - t.Errorf("Expected annotation %s not found in result", key) - } else if actualValue != expectedValue { - t.Errorf("Annotation %s: expected value %s, got %s", - key, expectedValue, actualValue) - } - } - - // Ensure no extra annotations - for key := range result { - if _, expected := tt.expectedResult[key]; !expected { - t.Errorf("Unexpected annotation %s in result", key) - } - } - }) - } -} diff --git a/fault-quarantine-module/pkg/initializer/init.go b/fault-quarantine-module/pkg/initializer/init.go new file mode 100644 index 000000000..48a7866b3 --- /dev/null +++ b/fault-quarantine-module/pkg/initializer/init.go @@ -0,0 +1,312 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
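The initializer package below exposes a single InitializeAll entry point that returns a Components bundle. As a usage sketch only (paths and flag values are hypothetical, and starting the reconciler itself is omitted), a caller in the module's main package might look like:

package main

import (
	"context"
	"log/slog"
	"os"
	"time"

	"github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/initializer"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Hypothetical values; in practice these come from flags or the environment.
	params := initializer.InitializationParams{
		MongoClientCertMountPath: "/etc/mongodb-certs",
		KubeconfigPath:           "", // empty relies on in-cluster config
		TomlConfigPath:           "/etc/fault-quarantine/config.toml",
		DryRun:                   false,
		CircuitBreakerEnabled:    true,
		CircuitBreakerPercentage: 50,
		CircuitBreakerDuration:   10 * time.Minute,
	}

	components, err := initializer.InitializeAll(ctx, params)
	if err != nil {
		slog.Error("initialization failed", "error", err)
		os.Exit(1)
	}

	// Start consuming health events; Start blocks until the context is cancelled
	// or the underlying change stream watcher fails.
	if err := components.EventWatcher.Start(ctx); err != nil {
		slog.Error("event watcher stopped", "error", err)
	}
}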
+ +package initializer + +import ( + "context" + "fmt" + "log/slog" + "path/filepath" + "time" + + "github.com/nvidia/nvsentinel/commons/pkg/configmanager" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/breaker" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/informer" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/mongodb" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/reconciler" + "github.com/nvidia/nvsentinel/store-client-sdk/pkg/storewatcher" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" +) + +type InitializationParams struct { + MongoClientCertMountPath string + KubeconfigPath string + TomlConfigPath string + DryRun bool + CircuitBreakerPercentage int + CircuitBreakerDuration time.Duration + CircuitBreakerEnabled bool +} + +type Components struct { + Reconciler *reconciler.Reconciler + EventWatcher *mongodb.EventWatcher + K8sClient *informer.FaultQuarantineClient + CircuitBreaker breaker.CircuitBreaker +} + +type EnvConfig struct { + Namespace string + MongoURI string + MongoDatabase string + MongoCollection string + TokenDatabase string + TokenCollection string + TotalTimeoutSeconds int + IntervalSeconds int + TotalCACertTimeoutSeconds int + IntervalCACertSeconds int + UnprocessedEventsMetricUpdateIntervalSeconds int +} + +func InitializeAll(ctx context.Context, params InitializationParams) (*Components, error) { + slog.Info("Starting fault quarantine module initialization") + + envConfig, err := loadEnvConfig() + if err != nil { + return nil, fmt.Errorf("failed to load environment configuration: %w", err) + } + + mongoConfig := createMongoConfig(envConfig, params.MongoClientCertMountPath) + tokenConfig := createTokenConfig(envConfig) + pipeline := createMongoPipeline() + + var tomlCfg config.TomlConfig + if err := configmanager.LoadTOMLConfig(params.TomlConfigPath, &tomlCfg); err != nil { + return nil, fmt.Errorf("error while loading the toml config: %w", err) + } + + if params.DryRun { + slog.Info("Running in dry-run mode") + } + + k8sClient, err := informer.NewFaultQuarantineClient(params.KubeconfigPath, params.DryRun, 30*time.Minute) + if err != nil { + return nil, fmt.Errorf("error while initializing kubernetes client: %w", err) + } + + slog.Info("Successfully initialized kubernetes client with embedded node informer") + + var circuitBreaker breaker.CircuitBreaker + + if params.CircuitBreakerEnabled { + cb, err := initializeCircuitBreaker( + ctx, + k8sClient, + envConfig.Namespace, + params.CircuitBreakerPercentage, + params.CircuitBreakerDuration, + ) + if err != nil { + return nil, fmt.Errorf("error while initializing circuit breaker: %w", err) + } + + circuitBreaker = cb + + slog.Info("Successfully initialized circuit breaker") + } else { + slog.Info("Circuit breaker is disabled, skipping initialization") + } + + reconcilerCfg := createReconcilerConfig( + tomlCfg, + params.DryRun, + params.CircuitBreakerEnabled, + ) + + reconcilerInstance := reconciler.NewReconciler( + reconcilerCfg, + k8sClient, + circuitBreaker, + ) + + healthEventCollection, err := initializeMongoCollection(ctx, mongoConfig) + if err != nil { + return nil, fmt.Errorf("error while initializing mongo collection: %w", err) + } + + eventWatcher := mongodb.NewEventWatcher( + mongoConfig, + tokenConfig, + pipeline, + healthEventCollection, + time.Duration(envConfig.UnprocessedEventsMetricUpdateIntervalSeconds)*time.Second, + reconcilerInstance, + ) + + 
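// Note: the event watcher and the reconciler hold references to each other.
// NewEventWatcher above receives the reconciler as its LastProcessedObjectIDStore,
// and the SetEventWatcher call below hands the watcher back to the reconciler,
// completing the two-way wiring before the Components bundle is returned.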
reconcilerInstance.SetEventWatcher(eventWatcher) + + slog.Info("Initialization completed successfully") + + return &Components{ + Reconciler: reconcilerInstance, + EventWatcher: eventWatcher, + K8sClient: k8sClient, + CircuitBreaker: circuitBreaker, + }, nil +} + +func loadEnvConfig() (*EnvConfig, error) { + envSpecs := []configmanager.EnvVarSpec{ + {Name: "POD_NAMESPACE"}, + {Name: "MONGODB_URI"}, + {Name: "MONGODB_DATABASE_NAME"}, + {Name: "MONGODB_COLLECTION_NAME"}, + {Name: "MONGODB_TOKEN_COLLECTION_NAME"}, + } + + envVars, envErrors := configmanager.ReadEnvVars(envSpecs) + if len(envErrors) > 0 { + for _, err := range envErrors { + slog.Error("Environment variable error", "error", err) + } + + return nil, fmt.Errorf("required environment variables are missing") + } + + totalTimeoutSeconds, err := getPositiveIntEnvVar("MONGODB_PING_TIMEOUT_TOTAL_SECONDS", 300) + if err != nil { + return nil, err + } + + intervalSeconds, err := getPositiveIntEnvVar("MONGODB_PING_INTERVAL_SECONDS", 5) + if err != nil { + return nil, err + } + + totalCACertTimeoutSeconds, err := getPositiveIntEnvVar("CA_CERT_MOUNT_TIMEOUT_TOTAL_SECONDS", 360) + if err != nil { + return nil, err + } + + intervalCACertSeconds, err := getPositiveIntEnvVar("CA_CERT_READ_INTERVAL_SECONDS", 5) + if err != nil { + return nil, err + } + + unprocessedEventsMetricUpdateIntervalSeconds, err := + getPositiveIntEnvVar("UNPROCESSED_EVENTS_METRIC_UPDATE_INTERVAL_SECONDS", 25) + if err != nil { + return nil, err + } + + return &EnvConfig{ + Namespace: envVars["POD_NAMESPACE"], + MongoURI: envVars["MONGODB_URI"], + MongoDatabase: envVars["MONGODB_DATABASE_NAME"], + MongoCollection: envVars["MONGODB_COLLECTION_NAME"], + TokenDatabase: envVars["MONGODB_DATABASE_NAME"], + TokenCollection: envVars["MONGODB_TOKEN_COLLECTION_NAME"], + TotalTimeoutSeconds: totalTimeoutSeconds, + IntervalSeconds: intervalSeconds, + TotalCACertTimeoutSeconds: totalCACertTimeoutSeconds, + IntervalCACertSeconds: intervalCACertSeconds, + UnprocessedEventsMetricUpdateIntervalSeconds: unprocessedEventsMetricUpdateIntervalSeconds, + }, nil +} + +func getPositiveIntEnvVar(name string, defaultValue int) (int, error) { + value, err := configmanager.GetEnvVar[int](name, &defaultValue, + func(v int) error { + if v <= 0 { + return fmt.Errorf("must be positive") + } + + return nil + }) + if err != nil { + return 0, fmt.Errorf("invalid %s: %w", name, err) + } + + return value, nil +} + +func createMongoConfig(envConfig *EnvConfig, mongoClientCertMountPath string) storewatcher.MongoDBConfig { + return storewatcher.MongoDBConfig{ + URI: envConfig.MongoURI, + Database: envConfig.MongoDatabase, + Collection: envConfig.MongoCollection, + ClientTLSCertConfig: storewatcher.MongoDBClientTLSCertConfig{ + TlsCertPath: filepath.Join(mongoClientCertMountPath, "tls.crt"), + TlsKeyPath: filepath.Join(mongoClientCertMountPath, "tls.key"), + CaCertPath: filepath.Join(mongoClientCertMountPath, "ca.crt"), + }, + TotalPingTimeoutSeconds: envConfig.TotalTimeoutSeconds, + TotalPingIntervalSeconds: envConfig.IntervalSeconds, + TotalCACertTimeoutSeconds: envConfig.TotalCACertTimeoutSeconds, + TotalCACertIntervalSeconds: envConfig.IntervalCACertSeconds, + } +} + +func createTokenConfig(envConfig *EnvConfig) storewatcher.TokenConfig { + return storewatcher.TokenConfig{ + ClientName: "fault-quarantine-module", + TokenDatabase: envConfig.TokenDatabase, + TokenCollection: envConfig.TokenCollection, + } +} + +func createMongoPipeline() mongo.Pipeline { + return mongo.Pipeline{ + bson.D{ + bson.E{Key: 
"$match", Value: bson.D{ + bson.E{Key: "operationType", Value: bson.D{ + bson.E{Key: "$in", Value: bson.A{"insert"}}, + }}, + }}, + }, + } +} + +func createReconcilerConfig( + tomlCfg config.TomlConfig, + dryRun bool, + circuitBreakerEnabled bool, +) reconciler.ReconcilerConfig { + return reconciler.ReconcilerConfig{ + TomlConfig: tomlCfg, + DryRun: dryRun, + CircuitBreakerEnabled: circuitBreakerEnabled, + } +} + +func initializeMongoCollection( + ctx context.Context, + mongoConfig storewatcher.MongoDBConfig, +) (*mongo.Collection, error) { + collection, err := storewatcher.GetCollectionClient(ctx, mongoConfig) + if err != nil { + return nil, fmt.Errorf("failed to get MongoDB collection: %w", err) + } + + return collection, nil +} + +func initializeCircuitBreaker( + ctx context.Context, + k8sClient *informer.FaultQuarantineClient, + namespace string, + percentage int, + duration time.Duration, +) (breaker.CircuitBreaker, error) { + circuitBreakerName := "fault-quarantine-circuit-breaker" + + slog.Info("Initializing circuit breaker", "configMap", circuitBreakerName, "namespace", namespace) + + cb, err := breaker.NewSlidingWindowBreaker(ctx, breaker.Config{ + Window: duration, + TripPercentage: float64(percentage), + K8sClient: k8sClient, + ConfigMapName: circuitBreakerName, + ConfigMapNamespace: namespace, + }) + if err != nil { + return nil, fmt.Errorf("failed to initialize circuit breaker: %w", err) + } + + return cb, nil +} diff --git a/fault-quarantine-module/pkg/reconciler/metrics.go b/fault-quarantine-module/pkg/metrics/metrics.go similarity index 61% rename from fault-quarantine-module/pkg/reconciler/metrics.go rename to fault-quarantine-module/pkg/metrics/metrics.go index 49b5b7f8f..a1aa45105 100644 --- a/fault-quarantine-module/pkg/reconciler/metrics.go +++ b/fault-quarantine-module/pkg/metrics/metrics.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-package reconciler +package metrics import ( "github.com/prometheus/client_golang/prometheus" @@ -21,25 +21,25 @@ import ( var ( // Event Processing Metrics - totalEventsReceived = promauto.NewCounter( + TotalEventsReceived = promauto.NewCounter( prometheus.CounterOpts{ Name: "fault_quarantine_events_received_total", Help: "Total number of events received from the watcher.", }, ) - totalEventsSuccessfullyProcessed = promauto.NewCounter( + TotalEventsSuccessfullyProcessed = promauto.NewCounter( prometheus.CounterOpts{ Name: "fault_quarantine_events_successfully_processed_total", Help: "Total number of events successfully processed.", }, ) - totalEventsSkipped = promauto.NewCounter( + TotalEventsSkipped = promauto.NewCounter( prometheus.CounterOpts{ Name: "fault_quarantine_events_skipped_total", Help: "Total number of events received on already cordoned node", }, ) - processingErrors = promauto.NewCounterVec( + ProcessingErrors = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "fault_quarantine_processing_errors_total", Help: "Total number of errors encountered during event processing.", @@ -48,21 +48,21 @@ var ( ) // Node Quarantine Metrics - totalNodesQuarantined = promauto.NewCounterVec( + TotalNodesQuarantined = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "fault_quarantine_nodes_quarantined_total", Help: "Total number of nodes quarantined.", }, []string{"node"}, ) - totalNodesUnquarantined = promauto.NewCounterVec( + TotalNodesUnquarantined = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "fault_quarantine_nodes_unquarantined_total", Help: "Total number of nodes unquarantined.", }, []string{"node"}, ) - currentQuarantinedNodes = promauto.NewGaugeVec( + CurrentQuarantinedNodes = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "fault_quarantine_current_quarantined_nodes", Help: "Current number of quarantined nodes.", @@ -71,27 +71,27 @@ var ( ) // Taint and Cordon Metrics - taintsApplied = promauto.NewCounterVec( + TaintsApplied = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "fault_quarantine_taints_applied_total", Help: "Total number of taints applied to nodes.", }, []string{"taint_key", "taint_effect"}, ) - taintsRemoved = promauto.NewCounterVec( + TaintsRemoved = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "fault_quarantine_taints_removed_total", Help: "Total number of taints removed from nodes.", }, []string{"taint_key", "taint_effect"}, ) - cordonsApplied = promauto.NewCounter( + CordonsApplied = promauto.NewCounter( prometheus.CounterOpts{ Name: "fault_quarantine_cordons_applied_total", Help: "Total number of cordons applied to nodes.", }, ) - cordonsRemoved = promauto.NewCounter( + CordonsRemoved = promauto.NewCounter( prometheus.CounterOpts{ Name: "fault_quarantine_cordons_removed_total", Help: "Total number of cordons removed from nodes.", @@ -99,21 +99,21 @@ var ( ) // Ruleset Evaluation Metrics - rulesetEvaluations = promauto.NewCounterVec( + RulesetEvaluations = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "fault_quarantine_ruleset_evaluations_total", Help: "Total number of ruleset evaluations.", }, []string{"ruleset"}, ) - rulesetPassed = promauto.NewCounterVec( + RulesetPassed = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "fault_quarantine_ruleset_passed_total", Help: "Total number of ruleset evaluations that passed.", }, []string{"ruleset"}, ) - rulesetFailed = promauto.NewCounterVec( + RulesetFailed = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "fault_quarantine_ruleset_failed_total", Help: "Total 
number of ruleset evaluations that failed.", @@ -122,7 +122,7 @@ var ( ) // Performance Metrics - eventHandlingDuration = promauto.NewHistogram( + EventHandlingDuration = promauto.NewHistogram( prometheus.HistogramOpts{ Name: "fault_quarantine_event_handling_duration_seconds", Help: "Histogram of event handling durations.", @@ -137,4 +137,50 @@ var ( Help: "Number of health events which fault quarantine is yet to process.", }, ) + + // Circuit Breaker Metrics + FaultQuarantineBreakerState = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "fault_quarantine_breaker_state", + Help: "State of the fault quarantine breaker.", + }, + []string{"state"}, + ) + FaultQuarantineBreakerUtilization = promauto.NewGauge( + prometheus.GaugeOpts{ + Name: "fault_quarantine_breaker_utilization", + Help: "Utilization of the fault quarantine breaker.", + }, + ) + FaultQuarantineGetTotalNodesDuration = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "fault_quarantine_get_total_nodes_duration_seconds", + Help: "Duration of getTotalNodesWithRetry calls in seconds.", + Buckets: prometheus.DefBuckets, + }, + []string{"result"}, + ) + FaultQuarantineGetTotalNodesErrors = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "fault_quarantine_get_total_nodes_errors_total", + Help: "Total number of errors from getTotalNodesWithRetry.", + }, + []string{"error_type"}, + ) + FaultQuarantineGetTotalNodesRetryAttempts = promauto.NewHistogram( + prometheus.HistogramOpts{ + Name: "fault_quarantine_get_total_nodes_retry_attempts", + Help: "Number of retry attempts needed for getTotalNodesWithRetry.", + Buckets: []float64{0, 1, 2, 3, 5, 10}, + }, + ) ) + +func SetFaultQuarantineBreakerUtilization(utilization float64) { + FaultQuarantineBreakerUtilization.Set(utilization) +} + +func SetFaultQuarantineBreakerState(state string) { + FaultQuarantineBreakerState.Reset() + FaultQuarantineBreakerState.WithLabelValues(state).Set(1) +} diff --git a/fault-quarantine-module/pkg/mongodb/event_watcher.go b/fault-quarantine-module/pkg/mongodb/event_watcher.go new file mode 100644 index 000000000..38b1641e0 --- /dev/null +++ b/fault-quarantine-module/pkg/mongodb/event_watcher.go @@ -0,0 +1,239 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
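The breaker gauges added above use a one-hot encoding for state: SetFaultQuarantineBreakerState resets the vector and then sets the current state's series to 1, so at most one state label reports 1 at a time. A hypothetical call site (state names and the utilization definition are assumptions; the breaker package's actual usage is not shown here):

package breaker // hypothetical call site

import "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/metrics"

func reportBreakerSnapshot(open bool, utilization float64) {
	state := "closed"
	if open {
		state = "open"
	}
	// Reset-then-set keeps exactly one state series at 1.
	metrics.SetFaultQuarantineBreakerState(state)
	metrics.SetFaultQuarantineBreakerUtilization(utilization)
}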
+ +package mongodb + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/nvidia/nvsentinel/data-models/pkg/model" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/metrics" + "github.com/nvidia/nvsentinel/store-client-sdk/pkg/storewatcher" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/bson/primitive" + "go.mongodb.org/mongo-driver/mongo" +) + +type EventWatcher struct { + mongoConfig storewatcher.MongoDBConfig + tokenConfig storewatcher.TokenConfig + mongoPipeline mongo.Pipeline + collection *mongo.Collection + watcher *storewatcher.ChangeStreamWatcher + processEventCallback func( + ctx context.Context, + event *model.HealthEventWithStatus, + ) *model.Status + unprocessedEventsMetricUpdateInterval time.Duration + lastProcessedObjectID LastProcessedObjectIDStore +} + +type LastProcessedObjectIDStore interface { + StoreLastProcessedObjectID(objID primitive.ObjectID) + LoadLastProcessedObjectID() (primitive.ObjectID, bool) +} + +type EventWatcherInterface interface { + Start(ctx context.Context) error + SetProcessEventCallback( + callback func( + ctx context.Context, + event *model.HealthEventWithStatus, + ) *model.Status, + ) +} + +func NewEventWatcher( + mongoConfig storewatcher.MongoDBConfig, + tokenConfig storewatcher.TokenConfig, + mongoPipeline mongo.Pipeline, + collection *mongo.Collection, + unprocessedEventsMetricUpdateInterval time.Duration, + lastProcessedObjectID LastProcessedObjectIDStore, +) *EventWatcher { + return &EventWatcher{ + mongoConfig: mongoConfig, + tokenConfig: tokenConfig, + mongoPipeline: mongoPipeline, + collection: collection, + unprocessedEventsMetricUpdateInterval: unprocessedEventsMetricUpdateInterval, + lastProcessedObjectID: lastProcessedObjectID, + } +} + +func (w *EventWatcher) SetProcessEventCallback( + callback func( + ctx context.Context, + event *model.HealthEventWithStatus, + ) *model.Status, +) { + w.processEventCallback = callback +} + +func (w *EventWatcher) Start(ctx context.Context) error { + slog.Info("Starting MongoDB event watcher") + + watcher, err := storewatcher.NewChangeStreamWatcher(ctx, w.mongoConfig, w.tokenConfig, w.mongoPipeline) + if err != nil { + return fmt.Errorf("failed to create change stream watcher: %w", err) + } + + w.watcher = watcher + + watcher.Start(ctx) + slog.Info("MongoDB change stream watcher started successfully") + + go w.updateUnprocessedEventsMetric(ctx, watcher) + + watchDoneCh := make(chan error, 1) + + go func() { + err := w.watchEvents(ctx, watcher) + if err != nil { + slog.Error("MongoDB event watcher goroutine failed", "error", err) + watchDoneCh <- err + } else { + slog.Error("MongoDB event watcher goroutine exited unexpectedly, event processing has stopped") + watchDoneCh <- fmt.Errorf("event watcher channel closed unexpectedly") + } + }() + + var watchErr error + select { + case <-ctx.Done(): + slog.Info("Context cancelled, stopping MongoDB event watcher") + case err := <-watchDoneCh: + slog.Error("Event watcher terminated unexpectedly, initiating shutdown", "error", err) + watchErr = fmt.Errorf("event watcher terminated: %w", err) + } + + watcher.Close(ctx) + + return watchErr +} + +func (w *EventWatcher) watchEvents(ctx context.Context, watcher *storewatcher.ChangeStreamWatcher) error { + for event := range watcher.Events() { + metrics.TotalEventsReceived.Inc() + + if processErr := w.processEvent(ctx, event); processErr != nil { + slog.Error("Event processing failed, but still marking as processed to proceed ahead", "error", processErr) + } + + if err := 
w.watcher.MarkProcessed(ctx); err != nil { + metrics.ProcessingErrors.WithLabelValues("mark_processed_error").Inc() + slog.Error("Error updating resume token", "error", err) + + return fmt.Errorf("failed to mark event as processed: %w", err) + } + } + + return nil +} + +func (w *EventWatcher) processEvent(ctx context.Context, event bson.M) error { + healthEventWithStatus := model.HealthEventWithStatus{} + err := storewatcher.UnmarshalFullDocumentFromEvent( + event, + &healthEventWithStatus, + ) + + if err != nil { + metrics.ProcessingErrors.WithLabelValues("unmarshal_error").Inc() + return fmt.Errorf("failed to unmarshal event: %w", err) + } + + slog.Debug("Processing event", "event", healthEventWithStatus) + w.storeEventObjectID(event) + + startTime := time.Now() + status := w.processEventCallback(ctx, &healthEventWithStatus) + + if status != nil { + if err := w.updateNodeQuarantineStatus(ctx, event, status); err != nil { + metrics.ProcessingErrors.WithLabelValues("update_quarantine_status_error").Inc() + return fmt.Errorf("failed to update node quarantine status: %w", err) + } + } + + duration := time.Since(startTime).Seconds() + metrics.EventHandlingDuration.Observe(duration) + + return nil +} + +func (w *EventWatcher) storeEventObjectID(eventBson bson.M) { + if fullDoc, ok := eventBson["fullDocument"].(bson.M); ok { + if objID, ok := fullDoc["_id"].(primitive.ObjectID); ok { + w.lastProcessedObjectID.StoreLastProcessedObjectID(objID) + } + } +} + +func (w *EventWatcher) updateUnprocessedEventsMetric(ctx context.Context, + watcher *storewatcher.ChangeStreamWatcher) { + ticker := time.NewTicker(w.unprocessedEventsMetricUpdateInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + objID, ok := w.lastProcessedObjectID.LoadLastProcessedObjectID() + if !ok { + continue + } + + unprocessedCount, err := watcher.GetUnprocessedEventCount(ctx, objID) + if err != nil { + slog.Debug("Failed to get unprocessed event count", "error", err) + continue + } + + metrics.EventBacklogSize.Set(float64(unprocessedCount)) + slog.Debug("Updated unprocessed events metric", "count", unprocessedCount, "afterObjectID", objID.Hex()) + } + } +} + +func (w *EventWatcher) updateNodeQuarantineStatus( + ctx context.Context, + event bson.M, + nodeQuarantinedStatus *model.Status, +) error { + document, ok := event["fullDocument"].(bson.M) + if !ok { + return fmt.Errorf("error extracting fullDocument from event") + } + + filter := bson.M{"_id": document["_id"]} + + update := bson.M{ + "$set": bson.M{ + "healtheventstatus.nodequarantined": *nodeQuarantinedStatus, + }, + } + + if _, err := w.collection.UpdateOne(ctx, filter, update); err != nil { + return fmt.Errorf("error updating document with _id: %v, error: %w", document["_id"], err) + } + + slog.Info("Document updated with status", "id", document["_id"], "status", *nodeQuarantinedStatus) + + return nil +} diff --git a/fault-quarantine-module/pkg/nodeinfo/nodeinfo.go b/fault-quarantine-module/pkg/nodeinfo/nodeinfo.go deleted file mode 100644 index 6e8ded586..000000000 --- a/fault-quarantine-module/pkg/nodeinfo/nodeinfo.go +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package nodeinfo - -import ( - "context" - "log/slog" - "sync" - - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" -) - -type NodeInfo struct { - quarantinedNodesMap map[string]bool - mutex sync.RWMutex - workSignal chan struct{} -} - -func NewNodeInfo(workSignal chan struct{}) *NodeInfo { - return &NodeInfo{ - quarantinedNodesMap: make(map[string]bool), - workSignal: workSignal, - } -} - -// GetQuarantinedNodesCount returns the number of quarantined nodes in a thread-safe manner -func (n *NodeInfo) GetQuarantinedNodesCount() int { - n.mutex.RLock() - defer n.mutex.RUnlock() - - return len(n.quarantinedNodesMap) -} - -// GetQuarantinedNodesCopy returns a copy of the quarantined nodes map in a thread-safe manner -func (n *NodeInfo) GetQuarantinedNodesCopy() map[string]bool { - n.mutex.RLock() - defer n.mutex.RUnlock() - - copy := make(map[string]bool) - for k, v := range n.quarantinedNodesMap { - copy[k] = v - } - - return copy -} - -func (n *NodeInfo) BuildQuarantinedNodesMap(k8sClient kubernetes.Interface) error { - nodes, err := k8sClient.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{}) - if err != nil { - return err - } - - n.mutex.Lock() - defer n.mutex.Unlock() - - for _, node := range nodes.Items { - key := common.QuarantineHealthEventIsCordonedAnnotationKey - if node.Annotations[key] == common.QuarantineHealthEventIsCordonedAnnotationValueTrue { - n.quarantinedNodesMap[node.Name] = true - } - } - - return nil -} - -func (n *NodeInfo) MarkNodeQuarantineStatusCache(nodeName string, isQuarantined bool, annotationExist bool) { - n.mutex.Lock() - defer n.mutex.Unlock() - - if isQuarantined { - n.quarantinedNodesMap[nodeName] = true - } else if !annotationExist { - // this is a case where a node is uncordoned manually, but annotation is still present. - // So, we should not remove it from the cache. - delete(n.quarantinedNodesMap, nodeName) - } - - slog.Debug("Quarantined nodes map", "map", n.quarantinedNodesMap, "count", len(n.quarantinedNodesMap)) - - n.signalWork() -} - -func (n *NodeInfo) GetNodeQuarantineStatusCache(nodeName string) bool { - n.mutex.RLock() - defer n.mutex.RUnlock() - - if _, exists := n.quarantinedNodesMap[nodeName]; !exists { - return false - } - - return true -} - -// signalWork sends a non-blocking signal to the reconciler's work channel. -func (n *NodeInfo) signalWork() { - if n.workSignal == nil { - slog.Error("No channel configured for node informer", "nodeInformer", n) - return // No channel configured - } - select { - case n.workSignal <- struct{}{}: - slog.Debug("Signalled work channel due to node change.") - default: - slog.Debug("Work channel already signalled, skipping signal for node change.") - } -} diff --git a/fault-quarantine-module/pkg/nodeinfo/nodeinfo_test.go b/fault-quarantine-module/pkg/nodeinfo/nodeinfo_test.go deleted file mode 100644 index 90ffe1b85..000000000 --- a/fault-quarantine-module/pkg/nodeinfo/nodeinfo_test.go +++ /dev/null @@ -1,552 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package nodeinfo - -import ( - "context" - "errors" - "testing" - "time" - - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/fake" - corev1 "k8s.io/client-go/kubernetes/typed/core/v1" -) - -func TestGetQuarantinedNodesMap(t *testing.T) { - nodeInfo := NewNodeInfo(make(chan struct{}, 1)) - nodeInfo.quarantinedNodesMap["node1"] = true - - // Test GetQuarantinedNodesCount - count := nodeInfo.GetQuarantinedNodesCount() - if count != 1 { - t.Errorf("Expected count to be 1, got %d", count) - } - - // Test GetQuarantinedNodesCopy - mapCopy := nodeInfo.GetQuarantinedNodesCopy() - if len(mapCopy) != 1 { - t.Errorf("Expected map to have 1 entry, got %d", len(mapCopy)) - } - - if !mapCopy["node1"] { - t.Error("Expected node1 to be in the map with value true") - } -} - -func TestBuildQuarantinedNodesMap(t *testing.T) { - tests := []struct { - name string - nodes []v1.Node - expectedMapKeys []string - expectedError error - }{ - { - name: "Empty node list", - nodes: []v1.Node{}, - expectedMapKeys: []string{}, - expectedError: nil, - }, - { - name: "No quarantined nodes", - nodes: []v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - Annotations: map[string]string{}, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node2", - Annotations: map[string]string{ - "other-annotation": "value", - }, - }, - }, - }, - expectedMapKeys: []string{}, - expectedError: nil, - }, - { - name: "Some quarantined nodes", - nodes: []v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - Annotations: map[string]string{ - common.QuarantineHealthEventIsCordonedAnnotationKey: common.QuarantineHealthEventIsCordonedAnnotationValueTrue, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node2", - Annotations: map[string]string{}, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node3", - Annotations: map[string]string{ - common.QuarantineHealthEventIsCordonedAnnotationKey: common.QuarantineHealthEventIsCordonedAnnotationValueTrue, - }, - }, - }, - }, - expectedMapKeys: []string{"node1", "node3"}, - expectedError: nil, - }, - { - name: "Incorrect annotation value", - nodes: []v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - Annotations: map[string]string{ - common.QuarantineHealthEventIsCordonedAnnotationKey: "False", - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node2", - Annotations: map[string]string{ - common.QuarantineHealthEventIsCordonedAnnotationKey: common.QuarantineHealthEventIsCordonedAnnotationValueTrue, - }, - }, - }, - }, - expectedMapKeys: []string{"node2"}, - expectedError: nil, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Create a fake k8s client with the test nodes - fakeClient := fake.NewSimpleClientset() - for _, node := range tt.nodes { - _, err := 
fakeClient.CoreV1().Nodes().Create(context.Background(), &node, metav1.CreateOptions{}) - if err != nil { - t.Fatalf("Failed to create fake node: %v", err) - } - } - - nodeInfo := NewNodeInfo(make(chan struct{}, 1)) - err := nodeInfo.BuildQuarantinedNodesMap(fakeClient) - - // Check error - if (err != nil && tt.expectedError == nil) || (err == nil && tt.expectedError != nil) { - t.Errorf("Expected error %v, got %v", tt.expectedError, err) - } - - // Check map contents - nodeMap := nodeInfo.quarantinedNodesMap - if len(nodeMap) != len(tt.expectedMapKeys) { - t.Errorf("Expected %d nodes in map, got %d", len(tt.expectedMapKeys), len(nodeMap)) - } - - for _, key := range tt.expectedMapKeys { - if !nodeMap[key] { - t.Errorf("Expected node %s to be in map", key) - } - } - }) - } -} - -func TestMarkNodeQuarantineStatusCache(t *testing.T) { - tests := []struct { - name string - initialMap map[string]bool - nodeToMark string - isQuarantined bool - expectedMap map[string]bool - expectedSignalCalled bool - }{ - { - name: "Mark node as quarantined", - initialMap: map[string]bool{ - "node1": true, - }, - nodeToMark: "node2", - isQuarantined: true, - expectedMap: map[string]bool{ - "node1": true, - "node2": true, - }, - expectedSignalCalled: true, - }, - { - name: "Mark node as unquarantined", - initialMap: map[string]bool{ - "node1": true, - "node2": true, - }, - nodeToMark: "node2", - isQuarantined: false, - expectedMap: map[string]bool{ - "node1": true, - }, - expectedSignalCalled: true, - }, - { - name: "Mark non-existent node as unquarantined", - initialMap: map[string]bool{ - "node1": true, - }, - nodeToMark: "node2", - isQuarantined: false, - expectedMap: map[string]bool{ - "node1": true, - }, - expectedSignalCalled: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - workSignal := make(chan struct{}, 1) - nodeInfo := NewNodeInfo(workSignal) - - // Initialize the map - for node, status := range tt.initialMap { - nodeInfo.quarantinedNodesMap[node] = status - } - - // Call the method - nodeInfo.MarkNodeQuarantineStatusCache(tt.nodeToMark, tt.isQuarantined, false) - - // Check the map was updated correctly - if len(nodeInfo.quarantinedNodesMap) != len(tt.expectedMap) { - t.Errorf("Expected map size %d, got %d", len(tt.expectedMap), len(nodeInfo.quarantinedNodesMap)) - } - - for node, expectedStatus := range tt.expectedMap { - actualStatus, exists := nodeInfo.quarantinedNodesMap[node] - if !exists { - t.Errorf("Expected node %s to be in map", node) - continue - } - if actualStatus != expectedStatus { - t.Errorf("Expected node %s status to be %v, got %v", node, expectedStatus, actualStatus) - } - } - - // Check if signal was called - if tt.expectedSignalCalled { - select { - case <-workSignal: - // Signal was received, as expected - case <-time.After(100 * time.Millisecond): - t.Error("Expected work signal to be sent, but it wasn't") - } - } - }) - } -} - -func TestGetNodeQuarantineStatusCache(t *testing.T) { - tests := []struct { - name string - initialMap map[string]bool - nodeToCheck string - expectedBool bool - }{ - { - name: "Node is quarantined", - initialMap: map[string]bool{ - "node1": true, - "node2": true, - }, - nodeToCheck: "node1", - expectedBool: true, - }, - { - name: "Node is not quarantined", - initialMap: map[string]bool{ - "node1": true, - }, - nodeToCheck: "node2", - expectedBool: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - nodeInfo := NewNodeInfo(make(chan struct{}, 1)) - - // Initialize the map - for 
node, status := range tt.initialMap { - nodeInfo.quarantinedNodesMap[node] = status - } - - // Call the method - result := nodeInfo.GetNodeQuarantineStatusCache(tt.nodeToCheck) - - // Check the result - if result != tt.expectedBool { - t.Errorf("Expected %v, got %v", tt.expectedBool, result) - } - }) - } -} - -func TestSignalWork(t *testing.T) { - tests := []struct { - name string - workSignal chan struct{} - preloadChannel bool - expectSignalSent bool - }{ - { - name: "Signal sent on empty channel", - workSignal: make(chan struct{}, 1), - preloadChannel: false, - expectSignalSent: true, - }, - { - name: "Signal not sent on full channel", - workSignal: make(chan struct{}, 1), - preloadChannel: true, - expectSignalSent: false, - }, - { - name: "Nil channel", - workSignal: nil, - preloadChannel: false, - expectSignalSent: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - nodeInfo := NewNodeInfo(tt.workSignal) - - // Preload the channel if needed - if tt.preloadChannel && tt.workSignal != nil { - tt.workSignal <- struct{}{} - } - - // Call the method - nodeInfo.signalWork() - - // Check if signal was sent - if tt.expectSignalSent { - select { - case <-tt.workSignal: - // Signal was received, as expected - case <-time.After(100 * time.Millisecond): - t.Error("Expected work signal to be sent, but it wasn't") - } - } else if tt.workSignal != nil { - // Only check for unexpected signals if the channel is not nil - select { - case <-tt.workSignal: - if !tt.preloadChannel { - t.Error("Unexpected signal received") - } - case <-time.After(100 * time.Millisecond): - // No signal, as expected for a full channel - } - } - }) - } -} - -// MockK8sClient is a mock implementation of kubernetes.Interface for error case testing -type MockK8sClient struct { - kubernetes.Interface - shouldReturnError bool -} - -func (m *MockK8sClient) CoreV1() corev1.CoreV1Interface { - return &MockCoreV1Client{shouldReturnError: m.shouldReturnError} -} - -type MockCoreV1Client struct { - corev1.CoreV1Interface - shouldReturnError bool -} - -func (m *MockCoreV1Client) Nodes() corev1.NodeInterface { - return &MockNodeClient{shouldReturnError: m.shouldReturnError} -} - -type MockNodeClient struct { - corev1.NodeInterface - shouldReturnError bool -} - -func (m *MockNodeClient) List(ctx context.Context, opts metav1.ListOptions) (*v1.NodeList, error) { - if m.shouldReturnError { - return nil, errors.New("mock error") - } - return &v1.NodeList{}, nil -} - -// TestMarkNodeQuarantineStatusCache_ManualUncordon tests the specific scenario -// where a node is manually uncordoned but still has quarantine annotations -func TestMarkNodeQuarantineStatusCache_ManualUncordon(t *testing.T) { - workSignal := make(chan struct{}, 10) - nodeInfo := NewNodeInfo(workSignal) - - // Step 1: Node is quarantined by FQM - nodeInfo.MarkNodeQuarantineStatusCache("test-node", true, true) - - // Verify node is in quarantine map - if !nodeInfo.GetNodeQuarantineStatusCache("test-node") { - t.Fatal("Expected node to be quarantined after initial quarantine") - } - - // Drain any existing signals - for { - select { - case <-workSignal: - // keep draining - default: - goto done - } - } -done: - - // Step 2: Node is manually uncordoned (isQuarantined=false) but annotation still exists - nodeInfo.MarkNodeQuarantineStatusCache("test-node", false, true) - - // Verify node is STILL in quarantine map because annotation exists - if !nodeInfo.GetNodeQuarantineStatusCache("test-node") { - t.Error("Expected node to remain in quarantine map 
when manually uncordoned but annotation exists") - } - - // Verify the internal map still has the node - nodeInfo.mutex.RLock() - _, exists := nodeInfo.quarantinedNodesMap["test-node"] - nodeInfo.mutex.RUnlock() - - if !exists { - t.Error("Expected node to remain in internal quarantinedNodesMap when annotation exists") - } - - // Verify work signal was sent - select { - case <-workSignal: - // Good, signal was sent - case <-time.After(100 * time.Millisecond): - t.Error("Expected work signal when marking manual uncordon") - } - - // Step 3: Annotation is removed (proper cleanup) - nodeInfo.MarkNodeQuarantineStatusCache("test-node", false, false) - - // Now node should be removed from quarantine map - if nodeInfo.GetNodeQuarantineStatusCache("test-node") { - t.Error("Expected node to be removed from quarantine map when annotation is removed") - } - - // Verify work signal was sent for cleanup - select { - case <-workSignal: - // Good, signal was sent - case <-time.After(100 * time.Millisecond): - t.Error("Expected work signal when removing from quarantine") - } -} - -// TestMarkNodeQuarantineStatusCache_EdgeCases tests various edge cases -func TestMarkNodeQuarantineStatusCache_EdgeCases(t *testing.T) { - tests := []struct { - name string - nodeName string - isQuarantined bool - annotationExist bool - expectedInMap bool - description string - }{ - { - name: "Quarantine with annotation", - nodeName: "node1", - isQuarantined: true, - annotationExist: true, - expectedInMap: true, - description: "Normal quarantine operation", - }, - { - name: "Quarantine without annotation (edge case)", - nodeName: "node2", - isQuarantined: true, - annotationExist: false, - expectedInMap: true, - description: "Quarantine even without annotation", - }, - { - name: "Unquarantine without annotation", - nodeName: "node3", - isQuarantined: false, - annotationExist: false, - expectedInMap: false, - description: "Normal unquarantine operation", - }, - { - name: "Manual uncordon (annotation exists)", - nodeName: "node4", - isQuarantined: false, - annotationExist: true, - expectedInMap: true, - description: "Node manually uncordoned but annotation remains", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - workSignal := make(chan struct{}, 1) - nodeInfo := NewNodeInfo(workSignal) - - // If testing unquarantine or manual uncordon, first quarantine the node - if !tt.isQuarantined || tt.name == "Manual uncordon (annotation exists)" { - nodeInfo.MarkNodeQuarantineStatusCache(tt.nodeName, true, true) - // Drain signal - select { - case <-workSignal: - // drained - default: - // nothing to drain - continue safely - } - } - - // Perform the test operation - nodeInfo.MarkNodeQuarantineStatusCache(tt.nodeName, tt.isQuarantined, tt.annotationExist) - - // Check if node is in map - isInMap := nodeInfo.GetNodeQuarantineStatusCache(tt.nodeName) - - if isInMap != tt.expectedInMap { - t.Errorf("%s: expected node in map = %v, got %v. %s", - tt.name, tt.expectedInMap, isInMap, tt.description) - } - - // Verify work signal was sent - select { - case <-workSignal: - // Good - case <-time.After(100 * time.Millisecond): - t.Errorf("%s: expected work signal to be sent", tt.name) - } - }) - } -} diff --git a/fault-quarantine-module/pkg/reconciler/node_quarantine.go b/fault-quarantine-module/pkg/reconciler/node_quarantine.go deleted file mode 100644 index bba70cab6..000000000 --- a/fault-quarantine-module/pkg/reconciler/node_quarantine.go +++ /dev/null @@ -1,507 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package reconciler - -import ( - "context" - "fmt" - "log/slog" - "time" - - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" - - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" - "k8s.io/client-go/util/retry" -) - -// other modules may also update the node, so we need to make sure that we retry on conflict -var customBackoff = wait.Backoff{ - Steps: 10, - Duration: 10 * time.Millisecond, - Factor: 1.5, - Jitter: 0.1, -} - -type FaultQuarantineClient struct { - // client is the Kubernetes client - clientset kubernetes.Interface - dryRunMode bool - nodeInformer NodeInfoProvider -} - -// NodeInfoProvider defines the interface for getting node counts efficiently -type NodeInfoProvider interface { - GetGpuNodeCounts() (totalGpuNodes int, cordonedNodesMap map[string]bool, err error) - HasSynced() bool -} - -func NewFaultQuarantineClient(kubeconfig string, dryRun bool) (*FaultQuarantineClient, error) { - config, err := rest.InClusterConfig() - if err != nil { - if kubeconfig == "" { - return nil, fmt.Errorf("kubeconfig is not set") - } - - // build config from kubeconfig file - config, err = clientcmd.BuildConfigFromFlags("", kubeconfig) - if err != nil { - return nil, fmt.Errorf("error creating Kubernetes config from kubeconfig: %w", err) - } - } - - clientset, err := kubernetes.NewForConfig(config) - if err != nil { - return nil, fmt.Errorf("error creating clientset: %w", err) - } - - client := &FaultQuarantineClient{ - clientset: clientset, - dryRunMode: dryRun, - } - - return client, nil -} - -func (c *FaultQuarantineClient) GetK8sClient() kubernetes.Interface { - return c.clientset -} - -func (c *FaultQuarantineClient) EnsureCircuitBreakerConfigMap(ctx context.Context, - name, namespace string, initialStatus string) error { - slog.Info("Ensuring circuit breaker config map", - "name", name, - "namespace", namespace, - "initialStatus", initialStatus) - - cmClient := c.clientset.CoreV1().ConfigMaps(namespace) - - _, err := cmClient.Get(ctx, name, metav1.GetOptions{}) - if err == nil { - slog.Info("Circuit breaker config map already exists", - "name", name, - "namespace", namespace) - - return nil - } - - if !errors.IsNotFound(err) { - slog.Error("Error getting circuit breaker config map", - "name", name, - "namespace", namespace, - "error", err) - - return fmt.Errorf("failed to get circuit breaker config map %s/%s: %w", namespace, name, err) - } - - cm := &v1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, - Data: map[string]string{"status": initialStatus}, - } - - _, err = cmClient.Create(ctx, cm, metav1.CreateOptions{}) - if err != nil { - slog.Error("Error creating circuit breaker config map", - "name", 
name, - "namespace", namespace, - "error", err) - - return fmt.Errorf("failed to create circuit breaker config map %s/%s: %w", namespace, name, err) - } - - return nil -} - -func (c *FaultQuarantineClient) GetTotalGpuNodes(ctx context.Context) (int, error) { - // Use NodeInformer lister if available and synced (much more efficient) - if c.nodeInformer.HasSynced() { - totalNodes, _, err := c.nodeInformer.GetGpuNodeCounts() - if err == nil { - slog.Debug("Got total GPU nodes from NodeInformer lister", "totalNodes", totalNodes) - return totalNodes, nil - } - - slog.Debug("NodeInformer failed, falling back to API", "error", err) - } - - nodes, err := c.clientset.CoreV1().Nodes().List(ctx, - metav1.ListOptions{LabelSelector: "nvidia.com/gpu.present=true"}) - if err != nil { - return 0, fmt.Errorf("failed to list GPU nodes: %w", err) - } - - slog.Debug("Got total GPU nodes from K8s API", "totalNodes", len(nodes.Items)) - - return len(nodes.Items), nil -} - -func (c *FaultQuarantineClient) SetNodeInformer(nodeInformer NodeInfoProvider) { - c.nodeInformer = nodeInformer -} - -func (c *FaultQuarantineClient) ReadCircuitBreakerState(ctx context.Context, name, namespace string) (string, error) { - slog.Info("Reading circuit breaker state from config map", - "name", name, - "namespace", namespace) - - cm, err := c.clientset.CoreV1().ConfigMaps(namespace).Get(ctx, name, metav1.GetOptions{}) - if err != nil { - return "", fmt.Errorf("failed to get circuit breaker config map %s/%s: %w", namespace, name, err) - } - - if cm.Data == nil { - return "", nil - } - - return cm.Data["status"], nil -} - -func (c *FaultQuarantineClient) WriteCircuitBreakerState(ctx context.Context, name, namespace, status string) error { - cmClient := c.clientset.CoreV1().ConfigMaps(namespace) - - return retry.OnError(customBackoff, errors.IsConflict, func() error { - cm, err := cmClient.Get(ctx, name, metav1.GetOptions{}) - if err != nil { - slog.Error("Error getting circuit breaker config map", - "name", name, - "namespace", namespace, - "error", err) - - return fmt.Errorf("failed to get circuit breaker config map %s/%s: %w", namespace, name, err) - } - - if cm.Data == nil { - cm.Data = map[string]string{} - } - - cm.Data["status"] = status - - _, err = cmClient.Update(ctx, cm, metav1.UpdateOptions{}) - if err != nil { - slog.Error("Error updating circuit breaker config map", - "name", name, - "namespace", namespace, - "error", err) - - return fmt.Errorf("failed to update circuit breaker config map %s/%s: %w", namespace, name, err) - } - - return nil - }) -} - -// nolint: cyclop,gocognit //fix this as part of NGCC-21793 -func (c *FaultQuarantineClient) TaintAndCordonNodeAndSetAnnotations( - ctx context.Context, - nodename string, - taints []config.Taint, - isCordon bool, - annotations map[string]string, - labels map[string]string, -) error { - return retry.OnError(customBackoff, errors.IsConflict, func() error { - node, err := c.clientset.CoreV1().Nodes().Get(ctx, nodename, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("failed to get node: %w", err) - } - - // Taints check - if len(taints) > 0 { - // map to track existing taints - existingTaints := make(map[config.Taint]v1.Taint) - for _, taint := range node.Spec.Taints { - existingTaints[config.Taint{Key: taint.Key, Value: taint.Value, Effect: string(taint.Effect)}] = taint - } - - for _, taintConfig := range taints { - key := config.Taint{Key: taintConfig.Key, Value: taintConfig.Value, Effect: string(taintConfig.Effect)} - - // Check if the taint is already 
present, if not then add it - if _, exists := existingTaints[key]; !exists { - slog.Info("Tainting node with taint config", - "node", nodename, - "taintConfig", taintConfig) - - existingTaints[key] = v1.Taint{ - Key: taintConfig.Key, - Value: taintConfig.Value, - Effect: v1.TaintEffect(taintConfig.Effect), - } - } - } - - node.Spec.Taints = []v1.Taint{} - for _, taint := range existingTaints { - node.Spec.Taints = append(node.Spec.Taints, taint) - } - } - - // Cordon check - // nolint: cyclop, gocognit, nestif //fix this as part of NGCC-21793 - if isCordon { - _, exist := node.Annotations[common.QuarantineHealthEventAnnotationKey] - if node.Spec.Unschedulable { - if exist { - slog.Info("Node already cordoned by FQM; skipping taint/annotation updates", - "node", nodename) - return nil - } - - slog.Info("Node is cordoned manually; applying FQM taints/annotations", - "node", nodename) - } else { - // Cordoning the node since it is currently schedulable. - slog.Info("Cordoning node", "node", nodename) - - if !c.dryRunMode { - node.Spec.Unschedulable = true - } - } - } - - // Annotation check - if len(annotations) > 0 { - if node.Annotations == nil { - node.Annotations = make(map[string]string) - } - - slog.Info("Setting annotations on node", - "node", nodename, - "annotations", annotations) - // set annotations - for annotationKey, annotationValue := range annotations { - node.Annotations[annotationKey] = annotationValue - } - } - - // Labels check - if len(labels) > 0 { - slog.Info("Adding labels on node", "node", nodename) - - for k, v := range labels { - node.Labels[k] = v - } - } - - _, err = c.clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) - - if err != nil { - return fmt.Errorf("failed to taint node: %w", err) - } - - return nil - }) -} - -// nolint: cyclop,gocognit //fix this as part of NGCC-21793 -func (c *FaultQuarantineClient) UnTaintAndUnCordonNodeAndRemoveAnnotations( - ctx context.Context, - nodename string, - taints []config.Taint, - isUnCordon bool, - annotationKeys []string, - labelsToRemove []string, - labels map[string]string, -) error { - return retry.OnError(customBackoff, errors.IsConflict, func() error { - node, err := c.clientset.CoreV1().Nodes().Get(ctx, nodename, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("failed to get node: %w", err) - } - - // untaint check - if len(taints) > 0 { - taintsAlreadyPresentOnNodeMap := map[config.Taint]bool{} - for _, taint := range node.Spec.Taints { - taintsAlreadyPresentOnNodeMap[config.Taint{Key: taint.Key, Value: taint.Value, Effect: string(taint.Effect)}] = true - } - - // Check if the taints are present - toRemove := map[config.Taint]bool{} - - for _, taintConfig := range taints { - key := config.Taint{ - Key: taintConfig.Key, - Value: taintConfig.Value, - Effect: taintConfig.Effect, - } - - found := taintsAlreadyPresentOnNodeMap[key] - if !found { - slog.Info("Node already does not have the taint", - "node", nodename, - "taintConfig", taintConfig) - } else { - toRemove[taintConfig] = true - } - } - - if len(toRemove) == 0 { - return nil - } - - slog.Info("Untainting node with taint config", - "node", nodename, - "toRemove", toRemove) - - newTaints := []v1.Taint{} - - for _, taint := range node.Spec.Taints { - if toRemove[config.Taint{Key: taint.Key, Value: taint.Value, Effect: string(taint.Effect)}] { - // Skip taints that need to be removed - continue - } - - newTaints = append(newTaints, taint) - } - - node.Spec.Taints = newTaints - } - - // uncordon check - if isUnCordon { - 
slog.Info("Uncordoning node", "node", nodename) - - if !c.dryRunMode { - node.Spec.Unschedulable = false - } - - // Only add labels if labels map is provided (non-nil and non-empty) - if len(labels) > 0 { - slog.Info("Adding labels on node", "node", nodename) - - for k, v := range labels { - node.Labels[k] = v - } - - uncordonReason := node.Labels[cordonedReasonLabelKey] - - if len(uncordonReason) > 55 { - uncordonReason = uncordonReason[:55] - } - - node.Labels[uncordonedReasonLabelkey] = uncordonReason + "-removed" - } - } - - // Annotation check - if len(annotationKeys) > 0 && node.Annotations != nil { - // remove annotations - for _, annotationKey := range annotationKeys { - slog.Info("Removing annotation key from node", - "annotationKey", annotationKey, - "node", nodename) - delete(node.Annotations, annotationKey) - } - } - - // Label check - if len(labelsToRemove) > 0 { - for _, labelKey := range labelsToRemove { - slog.Info("Removing label key from node", - "labelKey", labelKey, - "node", nodename) - delete(node.Labels, labelKey) - } - } - - _, err = c.clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) - if err != nil { - return fmt.Errorf("failed to remove taint from node: %w", err) - } - - return nil - }) -} - -func (c *FaultQuarantineClient) GetNodeAnnotations(ctx context.Context, nodename string) (map[string]string, error) { - node, err := c.clientset.CoreV1().Nodes().Get(ctx, nodename, metav1.GetOptions{}) - if err != nil { - return nil, fmt.Errorf("failed to get node: %w", err) - } - - if node.Annotations == nil { - return map[string]string{}, nil - } - - // return a copy of the annotations map to prevent unintended modifications - annotations := make(map[string]string) - for key, value := range node.Annotations { - annotations[key] = value - } - - return annotations, nil -} - -func (c *FaultQuarantineClient) GetNodesWithAnnotation(ctx context.Context, annotationKey string) ([]string, error) { - nodes, err := c.clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - if err != nil { - return nil, fmt.Errorf("failed to list nodes: %w", err) - } - - var nodesWithAnnotation []string - - for _, node := range nodes.Items { - annotationValue, exists := node.Annotations[annotationKey] - if exists && annotationValue != "" { - nodesWithAnnotation = append(nodesWithAnnotation, node.Name) - } - } - - return nodesWithAnnotation, nil -} - -// UpdateNodeAnnotations updates only the specified annotations on a node without affecting other properties -func (c *FaultQuarantineClient) UpdateNodeAnnotations( - ctx context.Context, - nodename string, - annotations map[string]string, -) error { - return retry.OnError(customBackoff, errors.IsConflict, func() error { - node, err := c.clientset.CoreV1().Nodes().Get(ctx, nodename, metav1.GetOptions{}) - if err != nil { - return err - } - - // Update annotations - if node.Annotations == nil { - node.Annotations = make(map[string]string) - } - - for key, value := range annotations { - node.Annotations[key] = value - } - - updateOptions := metav1.UpdateOptions{} - if c.dryRunMode { - updateOptions.DryRun = []string{metav1.DryRunAll} - } - - _, err = c.clientset.CoreV1().Nodes().Update(ctx, node, updateOptions) - if err != nil { - return fmt.Errorf("failed to update node %s annotations: %w", nodename, err) - } - - slog.Info("Successfully updated annotations for node", "node", nodename) - - return nil - }) -} diff --git a/fault-quarantine-module/pkg/reconciler/node_quarantine_test.go 
b/fault-quarantine-module/pkg/reconciler/node_quarantine_test.go deleted file mode 100644 index 872ee8c27..000000000 --- a/fault-quarantine-module/pkg/reconciler/node_quarantine_test.go +++ /dev/null @@ -1,799 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package reconciler - -import ( - "context" - "testing" - "time" - - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" -) - -func TestTaintAndCordonNodeAndSetAnnotations(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "test-node" - - // Create a test node - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{}, - }, - Spec: v1.NodeSpec{}, - } - _, err := clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - if err != nil { - t.Fatalf("Failed to create test node: %v", err) - } - - k8sClient := &FaultQuarantineClient{ - clientset: clientset, - } - - taints := []config.Taint{ - { - Key: "test-key", - Value: "test-value", - Effect: "NoSchedule", - }, - } - annotations := map[string]string{ - "test-annotation": "test-value", - } - - cordonedByLabelKey = "cordon-by" - cordonedReasonLabelKey = "cordon-reason" - cordonedTimestampLabelKey = "cordon-timestamp" - - labelsMap := map[string]string{ - cordonedByLabelKey: common.ServiceName, - cordonedReasonLabelKey: "gpu-error", - cordonedTimestampLabelKey: time.Now().UTC().Format("2006-01-02T15-04-05Z"), - } - err = k8sClient.TaintAndCordonNodeAndSetAnnotations(ctx, nodeName, taints, true, annotations, labelsMap) - if err != nil { - t.Fatalf("TaintAndCordonNodeAndSetAnnotations failed: %v", err) - } - - updatedNode, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if err != nil { - t.Fatalf("Failed to get updated node: %v", err) - } - - // Check taints - if len(updatedNode.Spec.Taints) != 1 { - t.Errorf("Expected 1 taint, got %d", len(updatedNode.Spec.Taints)) - } - if updatedNode.Spec.Taints[0].Key != "test-key" { - t.Errorf("Unexpected taint key: %s", updatedNode.Spec.Taints[0].Key) - } - - // Check cordon - if !updatedNode.Spec.Unschedulable { - t.Errorf("Node should be cordoned") - } - if len(updatedNode.Labels) != 3 || updatedNode.Labels[cordonedByLabelKey] != common.ServiceName || updatedNode.Labels[cordonedReasonLabelKey] != "gpu-error" || updatedNode.Labels[cordonedTimestampLabelKey] == "" { - t.Errorf("Cordoned labels are not applied on node") - } - - // Check annotations - if val, ok := updatedNode.Annotations["test-annotation"]; !ok || val != "test-value" { - t.Errorf("Annotation not set correctly") - } -} - -func TestUnTaintAndUnCordonNodeAndRemoveAnnotations(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "test-node" - - 
cordonedByLabelKey = "cordon-by" - cordonedReasonLabelKey = "cordon-reason" - cordonedTimestampLabelKey = "cordon-timestamp" - uncordonedByLabelKey = "uncordon-by" - uncordonedReasonLabelkey = "uncordon-reason" - uncordonedTimestampLabelKey = "uncordon-timestamp" - - // Create a test node with taints, cordon, and annotations - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Annotations: map[string]string{ - "test-annotation": "test-value", - }, - Labels: map[string]string{ - cordonedByLabelKey: common.ServiceName, - cordonedReasonLabelKey: "gpu-error", - cordonedTimestampLabelKey: time.Now().UTC().Format("2006-01-02T15-04-05Z"), - }, - }, - Spec: v1.NodeSpec{ - Unschedulable: true, - Taints: []v1.Taint{ - { - Key: "test-key", - Value: "test-value", - Effect: v1.TaintEffect("NoSchedule"), - }, - }, - }, - } - _, err := clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - if err != nil { - t.Fatalf("Failed to create test node: %v", err) - } - - k8sClient := &FaultQuarantineClient{ - clientset: clientset, - } - - taints := []config.Taint{ - { - Key: "test-key", - Value: "test-value", - Effect: "NoSchedule", - }, - } - annotationKeys := []string{"test-annotation"} - - labelsMap := map[string]string{ - uncordonedByLabelKey: common.ServiceName, - uncordonedTimestampLabelKey: time.Now().UTC().Format("2006-01-02T15-04-05Z"), - } - - err = k8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations(ctx, nodeName, taints, true, annotationKeys, []string{cordonedByLabelKey, cordonedReasonLabelKey, cordonedTimestampLabelKey}, labelsMap) - if err != nil { - t.Fatalf("UnTaintAndUnCordonNodeAndRemoveAnnotations failed: %v", err) - } - - updatedNode, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if err != nil { - t.Fatalf("Failed to get updated node: %v", err) - } - - if len(updatedNode.Spec.Taints) != 0 { - t.Errorf("Expected 0 taints, got %d", len(updatedNode.Spec.Taints)) - } - - if updatedNode.Spec.Unschedulable { - t.Errorf("Node should be uncordoned") - } - - if _, ok := updatedNode.Annotations["test-annotation"]; ok { - t.Errorf("Annotation should be removed") - } - - _, exists1 := updatedNode.Labels[cordonedByLabelKey] - _, exists2 := updatedNode.Labels[cordonedReasonLabelKey] - _, exists3 := updatedNode.Labels[cordonedTimestampLabelKey] - - if exists1 || exists2 || exists3 { - t.Errorf("Expected cordoned labels to be removed from node") - } - - if len(updatedNode.Labels) != 3 || updatedNode.Labels[uncordonedByLabelKey] != common.ServiceName || updatedNode.Labels[uncordonedReasonLabelkey] != "gpu-error-removed" || updatedNode.Labels[uncordonedTimestampLabelKey] == "" { - t.Errorf("Expected uncordoned lables to be applied on node") - } -} - -func TestGetNodeAnnotations(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "test-node" - - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Annotations: map[string]string{ - "test-annotation": "test-value", - }, - }, - Spec: v1.NodeSpec{}, - } - _, err := clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - if err != nil { - t.Fatalf("Failed to create test node: %v", err) - } - - k8sClient := &FaultQuarantineClient{ - clientset: clientset, - } - - annotations, err := k8sClient.GetNodeAnnotations(ctx, nodeName) - if err != nil { - t.Fatalf("GetNodeAnnotations failed: %v", err) - } - - if val, exists := annotations["test-annotation"]; !exists { - t.Errorf("Expected 'test-annotation' to exist") - } else if val 
!= "test-value" { - t.Errorf("Expected 'test-value', got '%s'", val) - } - - if val, exists := annotations["non-existent"]; exists { - t.Errorf("Expected 'non-existent' to not exist, but got '%s'", val) - } - - // ensure that modifying the returned map does not affect the original node annotations - annotations["new-annotation"] = "new-value" - originalAnnotations, err := k8sClient.GetNodeAnnotations(ctx, nodeName) - if err != nil { - t.Fatalf("GetNodeAnnotations failed: %v", err) - } - if _, exists := originalAnnotations["new-annotation"]; exists { - t.Errorf("Modifying the returned annotations map should not affect the original node annotations") - } -} - -func TestGetNodesWithAnnotation(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - - node1 := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - Annotations: map[string]string{ - "test-annotation": "value1", - }, - }, - Spec: v1.NodeSpec{}, - } - node2 := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node2", - Annotations: map[string]string{ - "test-annotation": "value2", - }, - }, - Spec: v1.NodeSpec{}, - } - node3 := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node3", - Annotations: map[string]string{ - "other-annotation": "value3", - }, - }, - Spec: v1.NodeSpec{}, - } - clientset.CoreV1().Nodes().Create(ctx, node1, metav1.CreateOptions{}) - clientset.CoreV1().Nodes().Create(ctx, node2, metav1.CreateOptions{}) - clientset.CoreV1().Nodes().Create(ctx, node3, metav1.CreateOptions{}) - - k8sClient := &FaultQuarantineClient{ - clientset: clientset, - } - - nodes, err := k8sClient.GetNodesWithAnnotation(ctx, "test-annotation") - if err != nil { - t.Fatalf("GetNodesWithAnnotation failed: %v", err) - } - - if len(nodes) != 2 { - t.Errorf("Expected 2 nodes, got %d", len(nodes)) - } - - // Check if nodes are correct - expectedNodes := map[string]bool{"node1": true, "node2": true} - for _, nodeName := range nodes { - if !expectedNodes[nodeName] { - t.Errorf("Unexpected node: %s", nodeName) - } - } -} - -func TestTaintAndCordonNode_NodeNotFound(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - k8sClient := &FaultQuarantineClient{clientset: clientset} - - err := k8sClient.TaintAndCordonNodeAndSetAnnotations(ctx, "non-existent-node", nil, false, nil, map[string]string{}) - if err == nil { - t.Errorf("Expected error when node does not exist, got nil") - } -} - -func TestTaintAndCordonNode_NoChanges(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - - nodeName := "no-change-node" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{}, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - - k8sClient := &FaultQuarantineClient{clientset: clientset} - - err := k8sClient.TaintAndCordonNodeAndSetAnnotations(ctx, nodeName, nil, false, nil, map[string]string{}) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if len(updatedNode.Spec.Taints) != 0 { - t.Errorf("Expected no taints, got %d", len(updatedNode.Spec.Taints)) - } - if updatedNode.Spec.Unschedulable { - t.Errorf("Expected node to remain schedulable") - } - if len(updatedNode.Annotations) != 0 { - t.Errorf("Expected no annotations, got %v", updatedNode.Annotations) - } -} - -func TestUnTaintAndUnCordonNode_NoChanges(t *testing.T) { - ctx := context.Background() - clientset := 
fake.NewSimpleClientset() - - nodeName := "no-change-untaint-node" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{}, - }, - Spec: v1.NodeSpec{Unschedulable: false}, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - - k8sClient := &FaultQuarantineClient{clientset: clientset} - - // No taints to remove, node is already uncordoned, and no annotations to remove - err := k8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations(ctx, nodeName, nil, false, nil, []string{}, map[string]string{}) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if len(updatedNode.Spec.Taints) != 0 { - t.Errorf("Expected no taints, got %d", len(updatedNode.Spec.Taints)) - } - if updatedNode.Spec.Unschedulable { - t.Errorf("Expected node to remain uncordoned") - } - if len(updatedNode.Annotations) != 0 { - t.Errorf("Expected no annotations, got %v", updatedNode.Annotations) - } -} - -func TestUnTaintAndUnCordonNode_PartialTaintRemoval(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - - nodeName := "partial-taint-removal-node" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{}, - }, - Spec: v1.NodeSpec{ - Unschedulable: true, - Taints: []v1.Taint{ - {Key: "taint1", Value: "val1", Effect: v1.TaintEffectNoSchedule}, - {Key: "taint2", Value: "val2", Effect: v1.TaintEffectPreferNoSchedule}, - }, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - - k8sClient := &FaultQuarantineClient{clientset: clientset} - - taintsToRemove := []config.Taint{{Key: "taint1", Value: "val1", Effect: "NoSchedule"}} - err := k8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations(ctx, nodeName, taintsToRemove, false, nil, []string{}, map[string]string{}) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if len(updatedNode.Spec.Taints) != 1 { - t.Errorf("Expected 1 taint remaining, got %d", len(updatedNode.Spec.Taints)) - } - if updatedNode.Spec.Taints[0].Key != "taint2" { - t.Errorf("Expected taint2 to remain, got %s", updatedNode.Spec.Taints[0].Key) - } -} - -func TestUnTaintAndUnCordonNode_PartialAnnotationRemoval(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - - cordonedByLabelKey = "cordon-by" - cordonedReasonLabelKey = "cordon-reason" - cordonedTimestampLabelKey = "cordon-timestamp" - uncordonedByLabelKey = "uncordon-by" - uncordonedReasonLabelkey = "uncordon-reason" - uncordonedTimestampLabelKey = "uncordon-timestamp" - - nodeName := "partial-annotation-removal-node" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Annotations: map[string]string{ - "annotation1": "val1", - "annotation2": "val2", - }, - Labels: map[string]string{ - cordonedByLabelKey: common.ServiceName, - cordonedReasonLabelKey: "gpu-error", - cordonedTimestampLabelKey: time.Now().UTC().Format("2006-01-02T15-04-05Z"), - }, - }, - Spec: v1.NodeSpec{ - Unschedulable: true, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - - k8sClient := &FaultQuarantineClient{clientset: clientset} - - annotationsToRemove := []string{"annotation1"} - labelsMap := map[string]string{ - uncordonedByLabelKey: common.ServiceName, - uncordonedTimestampLabelKey: 
time.Now().UTC().Format("2006-01-02T15-04-05Z"), - } - err := k8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations(ctx, nodeName, nil, true, annotationsToRemove, []string{cordonedByLabelKey, cordonedReasonLabelKey, cordonedTimestampLabelKey}, labelsMap) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if updatedNode.Annotations["annotation1"] != "" { - t.Errorf("Expected annotation1 to be removed") - } - if updatedNode.Annotations["annotation2"] != "val2" { - t.Errorf("Expected annotation2 to remain") - } - if updatedNode.Spec.Unschedulable { - t.Errorf("Expected node to be uncordoned") - } -} - -func TestTaintAndCordonNode_AlreadyTaintedCOrdonned(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "already-tainted-cordoned-node" - - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{}, - }, - Spec: v1.NodeSpec{ - Unschedulable: true, - Taints: []v1.Taint{ - {Key: "test-key", Value: "test-value", Effect: v1.TaintEffectNoSchedule}, - }, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - k8sClient := &FaultQuarantineClient{clientset: clientset} - - taints := []config.Taint{{Key: "test-key", Value: "test-value", Effect: "NoSchedule"}} - err := k8sClient.TaintAndCordonNodeAndSetAnnotations(ctx, nodeName, taints, true, nil, map[string]string{}) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - // Should remain unchanged - if len(updatedNode.Spec.Taints) != 1 { - t.Errorf("Expected 1 taint, got %d", len(updatedNode.Spec.Taints)) - } - if !updatedNode.Spec.Unschedulable { - t.Errorf("Node should remain cordoned") - } -} - -func TestUnTaintAndUnCordonNode_AlreadyUntaintedUncordoned(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "already-untainted-uncordoned-node" - - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{}, - }, - Spec: v1.NodeSpec{Unschedulable: false}, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - k8sClient := &FaultQuarantineClient{clientset: clientset} - - err := k8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations(ctx, nodeName, nil, true, nil, []string{}, map[string]string{}) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if len(updatedNode.Spec.Taints) != 0 { - t.Errorf("Expected no taints, got %d", len(updatedNode.Spec.Taints)) - } - if updatedNode.Spec.Unschedulable { - t.Errorf("Node should remain uncordoned") - } -} - -func TestTaintAndCordonNode_InvalidTaintEffect(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "invalid-effect-node" - - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{}, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - k8sClient := &FaultQuarantineClient{clientset: clientset} - - // Provide an invalid effect - taints := []config.Taint{{Key: "weird-key", Value: "weird-value", Effect: "SomeInvalidEffect"}} - err := k8sClient.TaintAndCordonNodeAndSetAnnotations(ctx, nodeName, taints, false, nil, map[string]string{}) - if err != nil { - t.Fatalf("Expected 
no error adding invalid effect taint, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if len(updatedNode.Spec.Taints) != 1 { - t.Errorf("Expected 1 taint, got %d", len(updatedNode.Spec.Taints)) - } - if string(updatedNode.Spec.Taints[0].Effect) != "SomeInvalidEffect" { - t.Errorf("Expected effect 'SomeInvalidEffect', got '%s'", updatedNode.Spec.Taints[0].Effect) - } -} - -func TestTaintAndCordonNode_OverwriteAnnotation(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "overwrite-annotation-node" - - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Annotations: map[string]string{"existing-key": "old-value"}, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - k8sClient := &FaultQuarantineClient{clientset: clientset} - - annotations := map[string]string{"existing-key": "new-value"} - err := k8sClient.TaintAndCordonNodeAndSetAnnotations(ctx, nodeName, nil, false, annotations, map[string]string{}) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if updatedNode.Annotations["existing-key"] != "new-value" { - t.Errorf("Annotation value was not updated correctly") - } -} - -func TestUnTaintAndUnCordonNode_NonExistentTaintRemoval(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "non-existent-taint-removal-node" - - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{}, - }, - Spec: v1.NodeSpec{ - Taints: []v1.Taint{ - {Key: "taint1", Value: "val1", Effect: v1.TaintEffectNoSchedule}, - }, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - k8sClient := &FaultQuarantineClient{clientset: clientset} - - // Attempt to remove a taint that doesn't exist - taintsToRemove := []config.Taint{{Key: "taint-nonexistent", Value: "valX", Effect: "NoSchedule"}} - err := k8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations(ctx, nodeName, taintsToRemove, false, nil, []string{}, map[string]string{}) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - // Original taint should remain as we tried to remove a non-existent taint - if len(updatedNode.Spec.Taints) != 1 { - t.Errorf("Expected 1 taint to remain, got %d", len(updatedNode.Spec.Taints)) - } -} - -func TestUnTaintAndUnCordonNode_NonExistentAnnotationRemoval(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "non-existent-annotation-removal-node" - - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Annotations: map[string]string{ - "annotation1": "val1", - }, - }, - Spec: v1.NodeSpec{}, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - k8sClient := &FaultQuarantineClient{clientset: clientset} - - // Attempt to remove an annotation that doesn't exist - annotationsToRemove := []string{"nonexistent-annotation"} - err := k8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations(ctx, nodeName, nil, false, annotationsToRemove, []string{}, map[string]string{}) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - // Original annotation should remain - if 
updatedNode.Annotations["annotation1"] != "val1" { - t.Errorf("Non-existent annotation removal should not affect existing annotations") - } -} - -func TestTaintAndCordonNode_EmptyTaintKeyOrValue(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "empty-taint-key-value-node" - - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{}, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - k8sClient := &FaultQuarantineClient{clientset: clientset} - - // Taint with empty key and value - taints := []config.Taint{ - {Key: "", Value: "", Effect: "NoSchedule"}, - } - err := k8sClient.TaintAndCordonNodeAndSetAnnotations(ctx, nodeName, taints, false, nil, map[string]string{}) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if len(updatedNode.Spec.Taints) != 1 { - t.Errorf("Expected 1 taint, got %d", len(updatedNode.Spec.Taints)) - } - if updatedNode.Spec.Taints[0].Key != "" || updatedNode.Spec.Taints[0].Value != "" { - t.Errorf("Expected empty key and value taint") - } -} - -func TestTaintAndCordonNode_EmptyAnnotationKey(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - nodeName := "empty-annotation-key-node" - - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{}, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - k8sClient := &FaultQuarantineClient{clientset: clientset} - - annotations := map[string]string{ - "": "empty-key-value", - } - err := k8sClient.TaintAndCordonNodeAndSetAnnotations(ctx, nodeName, nil, false, annotations, map[string]string{}) - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - updatedNode, _ := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if updatedNode.Annotations[""] != "empty-key-value" { - t.Errorf("Expected empty key annotation to be set") - } -} - -func TestGetNodesWithAnnotation_NoMatches(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - - // Create a node without the target annotation - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-without-annotation", - Annotations: map[string]string{"some-other-annotation": "value"}, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - - k8sClient := &FaultQuarantineClient{clientset: clientset} - - nodes, err := k8sClient.GetNodesWithAnnotation(ctx, "non-existent-annotation") - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - if len(nodes) != 0 { - t.Errorf("Expected no nodes, got %d", len(nodes)) - } -} - -func TestGetNodesWithAnnotation_EmptyAnnotationKey(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-with-empty-key-annotation", - Annotations: map[string]string{"": "empty-key-annotation"}, - }, - } - clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) - k8sClient := &FaultQuarantineClient{clientset: clientset} - - nodes, err := k8sClient.GetNodesWithAnnotation(ctx, "") - if err != nil { - t.Fatalf("Expected no error, got %v", err) - } - if len(nodes) != 1 { - t.Errorf("Expected 1 node with empty key annotation, got %d", len(nodes)) - } - if nodes[0] != "node-with-empty-key-annotation" { - t.Errorf("Unexpected node 
returned: %s", nodes[0]) - } -} - -func TestTaintAndCordonNode_NonExistentNode(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - k8sClient := &FaultQuarantineClient{clientset: clientset} - - // Attempt to taint a node that doesn't exist - err := k8sClient.TaintAndCordonNodeAndSetAnnotations(ctx, "no-such-node", nil, true, nil, map[string]string{}) - if err == nil { - t.Errorf("Expected error for non-existent node, got nil") - } -} - -func TestUnTaintAndUnCordonNode_NonExistentNode(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - k8sClient := &FaultQuarantineClient{clientset: clientset} - - // Attempt to untaint a node that doesn't exist - err := k8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations(ctx, "no-such-node", nil, true, nil, []string{}, map[string]string{}) - if err == nil { - t.Errorf("Expected error for non-existent node, got nil") - } -} - -func TestGetNodeAnnotations_NonExistentNode(t *testing.T) { - ctx := context.Background() - clientset := fake.NewSimpleClientset() - k8sClient := &FaultQuarantineClient{clientset: clientset} - - _, err := k8sClient.GetNodeAnnotations(ctx, "no-such-node") - if err == nil { - t.Errorf("Expected error for non-existent node, got nil") - } -} diff --git a/fault-quarantine-module/pkg/reconciler/reconciler.go b/fault-quarantine-module/pkg/reconciler/reconciler.go index d2848ccc1..aaca0e2f6 100644 --- a/fault-quarantine-module/pkg/reconciler/reconciler.go +++ b/fault-quarantine-module/pkg/reconciler/reconciler.go @@ -17,6 +17,7 @@ package reconciler import ( "context" "encoding/json" + "errors" "fmt" "log/slog" "regexp" @@ -34,32 +35,16 @@ import ( "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/evaluator" "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/healthEventsAnnotation" "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/informer" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/nodeinfo" - "github.com/nvidia/nvsentinel/store-client-sdk/pkg/storewatcher" - - "go.mongodb.org/mongo-driver/bson" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/metrics" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/mongodb" "go.mongodb.org/mongo-driver/bson/primitive" - "go.mongodb.org/mongo-driver/mongo" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + corev1 "k8s.io/api/core/v1" ) -type CircuitBreakerConfig struct { - Namespace string - Name string - Percentage int - Duration time.Duration -} - type ReconcilerConfig struct { - TomlConfig config.TomlConfig - MongoHealthEventCollectionConfig storewatcher.MongoDBConfig - TokenConfig storewatcher.TokenConfig - MongoPipeline mongo.Pipeline - K8sClient K8sClientInterface - DryRun bool - CircuitBreakerEnabled bool - UnprocessedEventsMetricUpdateInterval time.Duration - CircuitBreaker CircuitBreakerConfig + TomlConfig config.TomlConfig + DryRun bool + CircuitBreakerEnabled bool } type rulesetsConfig struct { @@ -68,125 +53,153 @@ type rulesetsConfig struct { RuleSetPriorityMap map[string]int } +// keyValTaint represents a taint key-value pair used for deduplication and priority tracking +type keyValTaint struct { + Key string + Value string +} + type Reconciler struct { - config ReconcilerConfig - healthEventBuffer *common.HealthEventBuffer - nodeInfo *nodeinfo.NodeInfo - // workSignal acts as a semaphore to wake up the reconcile loop - workSignal chan struct{} - // nodeAnnotationsCache caches node annotations to avoid repeated K8s API calls - nodeAnnotationsCache sync.Map // 
map[string]map[string]string - // cacheMutex protects cache operations during refresh to ensure consistency - cacheMutex sync.RWMutex - lastProcessedObjectID atomic.Value // stores primitive.ObjectID + config ReconcilerConfig + k8sClient *informer.FaultQuarantineClient + lastProcessedObjectID atomic.Value cb breaker.CircuitBreaker -} + eventWatcher mongodb.EventWatcherInterface + taintInitKeys []keyValTaint // Pre-computed taint keys for map initialization + taintUpdateMu sync.Mutex // Protects taint priority updates -var ( // Label keys cordonedByLabelKey string cordonedReasonLabelKey string cordonedTimestampLabelKey string uncordonedByLabelKey string - uncordonedReasonLabelkey string + uncordonedReasonLabelKey string uncordonedTimestampLabelKey string -) +} -func NewReconciler(ctx context.Context, cfg ReconcilerConfig, workSignal chan struct{}) *Reconciler { - r := &Reconciler{ - config: cfg, - healthEventBuffer: common.NewHealthEventBuffer(ctx), - nodeInfo: nodeinfo.NewNodeInfo(workSignal), - workSignal: workSignal, // Store the signal channel - } - - if cfg.CircuitBreakerEnabled { - slog.Info("Initializing circuit breaker with config map %s in namespace %s", - cfg.CircuitBreaker.Name, cfg.CircuitBreaker.Namespace) - - cb, err := breaker.NewSlidingWindowBreaker(ctx, breaker.Config{ - Window: cfg.CircuitBreaker.Duration, - TripPercentage: float64(cfg.CircuitBreaker.Percentage), - GetTotalNodes: cfg.K8sClient.GetTotalGpuNodes, - EnsureConfigMap: func(c context.Context, initial breaker.State) error { - return cfg.K8sClient.EnsureCircuitBreakerConfigMap(c, - cfg.CircuitBreaker.Name, cfg.CircuitBreaker.Namespace, string(initial)) - }, - ReadStateFn: func(c context.Context) (breaker.State, error) { - val, err := cfg.K8sClient.ReadCircuitBreakerState(c, cfg.CircuitBreaker.Name, cfg.CircuitBreaker.Namespace) - if err != nil { - slog.Error("Error reading circuit breaker state from config map", - "name", cfg.CircuitBreaker.Name, "namespace", cfg.CircuitBreaker.Namespace, "error", err) - return breaker.State(""), fmt.Errorf("failed to read circuit breaker state: %w", err) - } - return breaker.State(val), nil - }, - WriteStateFn: func(c context.Context, s breaker.State) error { - return cfg.K8sClient.WriteCircuitBreakerState(c, cfg.CircuitBreaker.Name, cfg.CircuitBreaker.Namespace, string(s)) - }, - }) - if err != nil { - slog.Error("Failed to initialize circuit breaker", "error", err) - } +var ( + // Compile regex once at package initialization for efficiency + labelValueRegex = regexp.MustCompile(`[^a-zA-Z0-9_.-]`) - r.cb = cb - } else { - slog.Info("Circuit breaker is disabled, skipping initialization") + // Sentinel errors for better error handling + errNoQuarantineAnnotation = fmt.Errorf("no quarantine annotation") +) - r.cb = nil +func NewReconciler( + cfg ReconcilerConfig, + k8sClient *informer.FaultQuarantineClient, + circuitBreaker breaker.CircuitBreaker, +) *Reconciler { + r := &Reconciler{ + config: cfg, + k8sClient: k8sClient, + cb: circuitBreaker, } return r } func (r *Reconciler) SetLabelKeys(labelKeyPrefix string) { - cordonedByLabelKey = labelKeyPrefix + "cordon-by" - cordonedReasonLabelKey = labelKeyPrefix + "cordon-reason" - cordonedTimestampLabelKey = labelKeyPrefix + "cordon-timestamp" + r.cordonedByLabelKey = labelKeyPrefix + "cordon-by" + r.cordonedReasonLabelKey = labelKeyPrefix + "cordon-reason" + r.cordonedTimestampLabelKey = labelKeyPrefix + "cordon-timestamp" + + r.uncordonedByLabelKey = labelKeyPrefix + "uncordon-by" + r.uncordonedReasonLabelKey = labelKeyPrefix + 
"uncordon-reason" + r.uncordonedTimestampLabelKey = labelKeyPrefix + "uncordon-timestamp" +} + +func (r *Reconciler) StoreLastProcessedObjectID(objID primitive.ObjectID) { + r.lastProcessedObjectID.Store(objID) +} + +func (r *Reconciler) LoadLastProcessedObjectID() (primitive.ObjectID, bool) { + lastObjID := r.lastProcessedObjectID.Load() + if lastObjID == nil { + return primitive.ObjectID{}, false + } + + objID, ok := lastObjID.(primitive.ObjectID) + + return objID, ok +} - uncordonedByLabelKey = labelKeyPrefix + "uncordon-by" - uncordonedReasonLabelkey = labelKeyPrefix + "uncordon-reason" - uncordonedTimestampLabelKey = labelKeyPrefix + "uncordon-timestamp" +func (r *Reconciler) SetEventWatcher(eventWatcher mongodb.EventWatcherInterface) { + r.eventWatcher = eventWatcher } -// nolint: cyclop, gocognit //fix this as part of NGCC-21793 func (r *Reconciler) Start(ctx context.Context) error { - nodeInformer, err := informer.NewNodeInformer(r.config.K8sClient.GetK8sClient(), - 30*time.Minute, r.workSignal, r.nodeInfo) + r.setupNodeInformerCallbacks() + + ruleSetEvals, err := r.initializeRuleSetEvaluators() if err != nil { - return fmt.Errorf("failed to create NodeInformer: %w", err) + return fmt.Errorf("failed to initialize rule set evaluators: %w", err) } - // Set the callback to decrement the metric when a quarantined node with annotations is deleted - nodeInformer.SetOnQuarantinedNodeDeletedCallback(func(nodeName string) { - currentQuarantinedNodes.WithLabelValues(nodeName).Dec() - slog.Info("Decremented currentQuarantinedNodes metric for deleted quarantined node", "node", nodeName) - }) + r.setupLabelKeys() + + rulesetsConfig := r.buildRulesetsConfig() - // Set the callback to update the annotations cache when node annotations change - nodeInformer.SetOnNodeAnnotationsChangedCallback(r.handleNodeAnnotationChange) + r.precomputeTaintInitKeys(ruleSetEvals, rulesetsConfig) - // Set the callback to handle manual uncordon of quarantined nodes - nodeInformer.SetOnManualUncordonCallback(r.handleManualUncordon) + if !r.k8sClient.NodeInformer.WaitForSync(ctx) { + return fmt.Errorf("failed to sync NodeInformer cache") + } + + r.initializeQuarantineMetrics() + + if err := r.checkCircuitBreakerAtStartup(ctx); err != nil { + return err + } - if fqClient, ok := r.config.K8sClient.(*FaultQuarantineClient); ok { - fqClient.SetNodeInformer(nodeInformer) + r.eventWatcher.SetProcessEventCallback( + func(ctx context.Context, event *model.HealthEventWithStatus) *model.Status { + return r.ProcessEvent(ctx, event, ruleSetEvals, rulesetsConfig) + }, + ) + + if err := r.eventWatcher.Start(ctx); err != nil { + return fmt.Errorf("event watcher failed: %w", err) } - ruleSetEvals, err := evaluator.InitializeRuleSetEvaluators(r.config.TomlConfig.RuleSets, - r.config.K8sClient.GetK8sClient(), nodeInformer) + slog.Info("Event watcher stopped, exiting fault-quarantine reconciler.") + + return nil +} + +// setupNodeInformerCallbacks configures callbacks on the already-created node informer +func (r *Reconciler) setupNodeInformerCallbacks() { + r.k8sClient.NodeInformer.SetOnQuarantinedNodeDeletedCallback(func(nodeName string) { + metrics.CurrentQuarantinedNodes.WithLabelValues(nodeName).Set(0) + slog.Info("Set currentQuarantinedNodes to 0 for deleted quarantined node", "node", nodeName) + }) + + r.k8sClient.NodeInformer.SetOnManualUncordonCallback(r.handleManualUncordon) +} + +// initializeRuleSetEvaluators initializes all rule set evaluators from config +func (r *Reconciler) initializeRuleSetEvaluators() 
([]evaluator.RuleSetEvaluatorIface, error) { + ruleSetEvals, err := evaluator.InitializeRuleSetEvaluators(r.config.TomlConfig.RuleSets, r.k8sClient.NodeInformer) if err != nil { - return fmt.Errorf("failed to initialize all rule set evaluators: %w", err) + return nil, fmt.Errorf("failed to initialize all rule set evaluators: %w", err) } + return ruleSetEvals, nil +} + +// setupLabelKeys configures label keys for cordon/uncordon tracking +func (r *Reconciler) setupLabelKeys() { r.SetLabelKeys(r.config.TomlConfig.LabelPrefix) + r.k8sClient.SetLabelKeys(r.cordonedReasonLabelKey, r.uncordonedReasonLabelKey) +} +// buildRulesetsConfig builds the rulesets configuration maps from TOML config +func (r *Reconciler) buildRulesetsConfig() rulesetsConfig { taintConfigMap := make(map[string]*config.Taint) cordonConfigMap := make(map[string]bool) ruleSetPriorityMap := make(map[string]int) - // map ruleset name to taint and cordon configs for _, ruleSet := range r.config.TomlConfig.RuleSets { if ruleSet.Taint.Key != "" { taintConfigMap[ruleSet.Name] = &ruleSet.Taint @@ -201,459 +214,339 @@ func (r *Reconciler) Start(ctx context.Context) error { } } - rulesetsConfig := rulesetsConfig{ + return rulesetsConfig{ TaintConfigMap: taintConfigMap, CordonConfigMap: cordonConfigMap, RuleSetPriorityMap: ruleSetPriorityMap, } +} - watcher, err := storewatcher.NewChangeStreamWatcher( - ctx, - r.config.MongoHealthEventCollectionConfig, - r.config.TokenConfig, - r.config.MongoPipeline, - ) - if err != nil { - return fmt.Errorf("failed to create MongoDB change stream watcher: %w", err) +// precomputeTaintInitKeys pre-computes taint keys from rulesets for efficient map initialization +func (r *Reconciler) precomputeTaintInitKeys( + ruleSetEvals []evaluator.RuleSetEvaluatorIface, + rulesetsConfig rulesetsConfig, +) { + r.taintInitKeys = make([]keyValTaint, 0, len(ruleSetEvals)) + + for _, eval := range ruleSetEvals { + taintConfig := rulesetsConfig.TaintConfigMap[eval.GetName()] + if taintConfig != nil { + keyVal := keyValTaint{ + Key: taintConfig.Key, + Value: taintConfig.Value, + } + r.taintInitKeys = append(r.taintInitKeys, keyVal) + } } - defer watcher.Close(ctx) - healthEventCollection, err := storewatcher.GetCollectionClient(ctx, r.config.MongoHealthEventCollectionConfig) - if err != nil { - slog.Error( - "Error initializing healthEventCollection client", - "config", r.config.MongoHealthEventCollectionConfig, - "error", err, - ) + slog.Info("Pre-computed taint initialization keys", "count", len(r.taintInitKeys)) +} - return fmt.Errorf("failed to get health event collection client: %w", err) +// initializeQuarantineMetrics initializes metrics for already quarantined nodes +func (r *Reconciler) initializeQuarantineMetrics() { + totalNodes, quarantinedNodesMap, err := r.k8sClient.NodeInformer.GetNodeCounts() + if err != nil { + slog.Error("Failed to get initial node counts", "error", err) + return } - err = r.nodeInfo.BuildQuarantinedNodesMap(r.config.K8sClient.GetK8sClient()) - if err != nil { - return fmt.Errorf("error fetching quarantined nodes: %w", err) - } else { - quarantinedNodesMap := r.nodeInfo.GetQuarantinedNodesCopy() + for nodeName := range quarantinedNodesMap { + metrics.CurrentQuarantinedNodes.WithLabelValues(nodeName).Set(1) + } - for nodeName := range quarantinedNodesMap { - currentQuarantinedNodes.WithLabelValues(nodeName).Inc() - } + slog.Info("Initial state", "totalNodes", totalNodes, "quarantinedNodes", len(quarantinedNodesMap), + "quarantinedNodesMap", quarantinedNodesMap) +} - slog.Info("Initial 
quarantinedNodesMap", "nodes", quarantinedNodesMap, "count", len(quarantinedNodesMap)) +// checkCircuitBreakerAtStartup checks if circuit breaker is tripped at startup +// Returns error if retry exhaustion occurs (should restart pod) +// Blocks indefinitely if circuit breaker is tripped (wait for manual intervention) +func (r *Reconciler) checkCircuitBreakerAtStartup(ctx context.Context) error { + if !r.config.CircuitBreakerEnabled { + return nil } - err = nodeInformer.Run(ctx.Done()) + tripped, err := r.cb.IsTripped(ctx) if err != nil { - return fmt.Errorf("failed to run NodeInformer: %w", err) - } - - // Wait for NodeInformer cache to sync before processing any events - slog.Info("Waiting for NodeInformer cache to sync before starting event processing...") - - for !nodeInformer.HasSynced() { - select { - case <-ctx.Done(): - slog.Warn("Context cancelled while waiting for node informer sync") - return ctx.Err() - case <-time.After(5 * time.Second): // Check periodically - slog.Info("NodeInformer cache is not synced yet, waiting for 5 seconds") + if errors.Is(err, breaker.ErrRetryExhausted) { + return err } - } - // Build initial node annotations cache - if err := r.buildNodeAnnotationsCache(ctx); err != nil { - // Continue anyway, individual API calls will be made as fallback - return fmt.Errorf("failed to build initial node annotations cache: %w", err) - } + slog.Error("Error checking if circuit breaker is tripped", "error", err) + <-ctx.Done() - // If breaker is enabled and already tripped at startup, halt until restart/manual close - if r.config.CircuitBreakerEnabled { - if tripped, err := r.cb.IsTripped(ctx); err != nil { - slog.Error("Error checking if circuit breaker is tripped", "error", err) - <-ctx.Done() + return fmt.Errorf("circuit breaker check failed: %w", err) + } - return fmt.Errorf("error checking if circuit breaker is tripped: %w", err) - } else if tripped { - slog.Error("Fault Quarantine circuit breaker is TRIPPED. Halting event dequeuing indefinitely.") - <-ctx.Done() + if tripped { + slog.Error("Fault Quarantine circuit breaker is TRIPPED. Halting event dequeuing indefinitely.") + <-ctx.Done() - return fmt.Errorf("circuit breaker is tripped at startup") - } + return fmt.Errorf("circuit breaker is TRIPPED at startup") } - watcher.Start(ctx) - slog.Info("Listening for events on the channel...") - go func() { - r.watchEvents(watcher) - slog.Info("MongoDB event watcher stopped (context cancelled or connection closed)") - }() - - // Start a goroutine to periodically update the unprocessed events metric - go r.updateUnprocessedEventsMetric(ctx, watcher) - - // Process events in the main goroutine - for { - select { - case <-ctx.Done(): - slog.Info("Context canceled. Exiting fault-quarantine event consumer.") - return nil - case <-r.workSignal: // Wait for a signal (semaphore acquired) - // Only check circuit breaker if it's enabled - if r.config.CircuitBreakerEnabled { - if tripped, err := r.cb.IsTripped(ctx); err != nil { - slog.Error("Error checking if circuit breaker is tripped", "error", err) - return fmt.Errorf("error checking if circuit breaker is tripped: %w", err) - } else if tripped { - slog.Error("Circuit breaker TRIPPED. 
Halting event processing.") - return fmt.Errorf("circuit breaker is tripped") - } - } - // Get current queue length - healthEventBufferLength := r.healthEventBuffer.Length() - if healthEventBufferLength == 0 { - slog.Debug("No events to process, skipping") - continue - } - - slog.Info("Processing batch of events", "count", healthEventBufferLength) - - // Process up to the current queue length - for healthEventIndex := 0; healthEventIndex < healthEventBufferLength; { - slog.Debug("Processing health event at index", "index", healthEventIndex) - - startTime := time.Now() - currentEventInfo, _ := r.healthEventBuffer.Get(healthEventIndex) - - if currentEventInfo == nil { - break - } - - healthEventWithStatus := currentEventInfo.HealthEventWithStatus - eventBson := currentEventInfo.EventBson - - // Check if event was already processed - if healthEventIndex == 0 && currentEventInfo.HasProcessed { - err := r.healthEventBuffer.RemoveAt(healthEventIndex) - if err != nil { - slog.Error("Error removing event", - "checkName", healthEventWithStatus.HealthEvent.CheckName, - "error", err) - - continue - } - - if err := watcher.MarkProcessed(ctx); err != nil { - processingErrors.WithLabelValues("mark_processed_error").Inc() - - slog.Error("Error updating resume token", "error", err) - } else { - slog.Info("Successfully marked event as processed", "node", healthEventWithStatus.HealthEvent.NodeName) - /* - Reason to reset healthEventIndex to 0 is that the current zeroth event is already processed and is deleted from - the array so we need to start from the beginning of the array again hence healthEventIndex is reset to 0 and - healthEventBufferLength is decremented by 1 because the element got deleted from the array on line number 226 - */ - healthEventIndex = 0 - healthEventBufferLength-- - - continue - } - } - - slog.Debug("Processing event %s at index %d", healthEventWithStatus.HealthEvent.CheckName, healthEventIndex) - // Reason to increment healthEventIndex is that we want to process the next event in the next iteration - healthEventIndex++ - - isNodeQuarantined, ruleEvaluationResult := r.handleEvent( - ctx, - healthEventWithStatus, - ruleSetEvals, - rulesetsConfig, - ) - - if ruleEvaluationResult == common.RuleEvaluationRetryAgainInFuture { - slog.Info(" Rule evaluation failed, will revaluate it in next iteration", "event", healthEventWithStatus) - continue - } - - if isNodeQuarantined == nil { - // Status is nil, meaning we intentionally skipped processing this event - // (e.g., healthy event without quarantine annotation or rule evaluation failed) - slog.Debug("Skipped processing event for node, no status update needed", - "node", healthEventWithStatus.HealthEvent.NodeName) - - currentEventInfo.HasProcessed = true - - r.storeEventObjectID(eventBson) - - duration := time.Since(startTime).Seconds() - eventHandlingDuration.Observe(duration) - totalEventsSkipped.Inc() - - continue - } - - // Process events with status - currentEventInfo.HasProcessed = true - - r.storeEventObjectID(eventBson) - - err := r.updateNodeQuarantineStatus(ctx, healthEventCollection, eventBson, isNodeQuarantined) - if err != nil { - slog.Error("Error updating Node quarantine status", "error", err) - processingErrors.WithLabelValues("update_quarantine_status_error").Inc() - } else if *isNodeQuarantined == model.Quarantined || *isNodeQuarantined == model.UnQuarantined { - // Only count as successfully processed if there was an actual state change - // AlreadyQuarantined means the event was skipped (already counted in handleEvent) - 
totalEventsSuccessfullyProcessed.Inc() - } - - duration := time.Since(startTime).Seconds() - eventHandlingDuration.Observe(duration) - } - } - } + return nil } -// storeEventObjectID extracts the ObjectID from the event and stores it for metric tracking -func (r *Reconciler) storeEventObjectID(eventBson bson.M) { - if fullDoc, ok := eventBson["fullDocument"].(bson.M); ok { - if objID, ok := fullDoc["_id"].(primitive.ObjectID); ok { - r.lastProcessedObjectID.Store(objID) - } +// ProcessEvent processes a single health event +func (r *Reconciler) ProcessEvent( + ctx context.Context, + event *model.HealthEventWithStatus, + ruleSetEvals []evaluator.RuleSetEvaluatorIface, + rulesetsConfig rulesetsConfig, +) *model.Status { + if shouldHalt := r.checkCircuitBreakerAndHalt(ctx); shouldHalt { + return nil } -} -// updateUnprocessedEventsMetric periodically updates the EventBacklogSize metric -// based on the ObjectID of the last processed event -func (r *Reconciler) updateUnprocessedEventsMetric(ctx context.Context, - watcher *storewatcher.ChangeStreamWatcher) { - ticker := time.NewTicker(r.config.UnprocessedEventsMetricUpdateInterval) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - lastObjID := r.lastProcessedObjectID.Load() - if lastObjID == nil { - continue - } - - objID, ok := lastObjID.(primitive.ObjectID) - if !ok { - continue - } + slog.Debug("Processing event", "checkName", event.HealthEvent.CheckName) - unprocessedCount, err := watcher.GetUnprocessedEventCount(ctx, objID) - if err != nil { - slog.Debug("Failed to get unprocessed event count", "error", err) - continue - } + isNodeQuarantined := r.handleEvent(ctx, event, ruleSetEvals, rulesetsConfig) - EventBacklogSize.Set(float64(unprocessedCount)) - slog.Debug("Updated unprocessed events metric", "count", unprocessedCount, "objectID", objID.Hex()) - } + if isNodeQuarantined == nil { + // Event was skipped (no quarantine action taken) + slog.Debug("Skipped processing event for node, no status update needed", "node", event.HealthEvent.NodeName) + metrics.TotalEventsSkipped.Inc() + } else if *isNodeQuarantined == model.Quarantined || + *isNodeQuarantined == model.UnQuarantined || + *isNodeQuarantined == model.AlreadyQuarantined { + metrics.TotalEventsSuccessfullyProcessed.Inc() } -} -func (r *Reconciler) watchEvents(watcher *storewatcher.ChangeStreamWatcher) { - for event := range watcher.Events() { - totalEventsReceived.Inc() + return isNodeQuarantined +} - healthEventWithStatus := model.HealthEventWithStatus{} - err := storewatcher.UnmarshalFullDocumentFromEvent( - event, - &healthEventWithStatus, - ) +// checkCircuitBreakerAndHalt checks if circuit breaker is tripped and returns true if processing should halt +func (r *Reconciler) checkCircuitBreakerAndHalt(ctx context.Context) bool { + if !r.config.CircuitBreakerEnabled { + return false + } - if err != nil { - slog.Error("Failed to unmarshal event", "error", err) - processingErrors.WithLabelValues("unmarshal_error").Inc() + tripped, err := r.cb.IsTripped(ctx) + if err != nil { + slog.Error("Error checking if circuit breaker is tripped", "error", err) + <-ctx.Done() - continue - } + return true + } - slog.Debug("Enqueuing event", "event", healthEventWithStatus) - r.healthEventBuffer.Add(&healthEventWithStatus, event) + if tripped { + slog.Error("Circuit breaker TRIPPED. 
Halting event processing until restart and breaker reset.") + <-ctx.Done() - select { - case r.workSignal <- struct{}{}: - slog.Debug("Signalled work channel for new health event") - default: - slog.Debug("Work channel already signalled, skipping duplicate signal") - } + return true } + + return false } -//nolint:cyclop,gocognit,nestif //fix this as part of NGCC-21793 func (r *Reconciler) handleEvent( ctx context.Context, event *model.HealthEventWithStatus, ruleSetEvals []evaluator.RuleSetEvaluatorIface, rulesetsConfig rulesetsConfig, -) (*model.Status, common.RuleEvaluationResult) { - var status model.Status +) *model.Status { + annotations, quarantineAnnotationExists := r.hasExistingQuarantine(event.HealthEvent.NodeName) - quarantineAnnotationExists := false + if quarantineAnnotationExists { + return r.handleAlreadyQuarantinedNode(ctx, event.HealthEvent, ruleSetEvals) + } - // Get quarantine annotations from cache or API fallback - annotations, annErr := r.getNodeQuarantineAnnotations(ctx, event.HealthEvent.NodeName) - if annErr != nil { - slog.Error("failed to fetch annotations for node %s: %+v", - event.HealthEvent.NodeName, annErr) + // For healthy events, if there's no existing quarantine annotation, + // skip processing as there's no transition from unhealthy to healthy + if event.HealthEvent.IsHealthy { + slog.Info("Skipping healthy event for node as there's no existing quarantine annotation", + "node", event.HealthEvent.NodeName, "event", event.HealthEvent) + + return nil } - if annErr == nil && annotations != nil { - annotationVal, exists := annotations[common.QuarantineHealthEventAnnotationKey] + taintAppliedMap := make(map[keyValTaint]string, len(r.taintInitKeys)) + taintEffectPriorityMap := make(map[keyValTaint]int, len(r.taintInitKeys)) - if exists && annotationVal != "" { - quarantineAnnotationExists = true - } + for _, keyVal := range r.taintInitKeys { + taintAppliedMap[keyVal] = "" + taintEffectPriorityMap[keyVal] = -1 } - if quarantineAnnotationExists { - // The node was already quarantined by FQM earlier. Delegate to the - // specialized handler which decides whether to keep it quarantined or - // un-quarantine based on the incoming event. 
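Regarding the per-taint bookkeeping the refactored handleEvent sets up (taintAppliedMap / taintEffectPriorityMap keyed by taint key and value): each taint key/value pair keeps a single candidate effect, and when several rulesets target the same pair the highest-priority ruleset wins. The following is a minimal standalone sketch of that selection; the ruleset names, priorities, and taint key are hypothetical and not taken from this change.

```go
package main

import "fmt"

// keyValTaint mirrors the idea of keying taints by (Key, Value) so that
// rulesets targeting the same taint compete only on effect and priority.
type keyValTaint struct {
	Key   string
	Value string
}

func main() {
	// Effect chosen so far for each taint key/value; "" means none yet.
	taintApplied := map[keyValTaint]string{}
	// Priority of the ruleset that set the current effect; -1 means unset.
	taintPriority := map[keyValTaint]int{}

	// Hypothetical taint key/value used purely for illustration.
	kv := keyValTaint{Key: "example.com/gpu-unhealthy", Value: "true"}
	taintApplied[kv] = ""
	taintPriority[kv] = -1

	// Two hypothetical rulesets propose effects for the same taint.
	proposals := []struct {
		ruleset  string
		priority int
		effect   string
	}{
		{"xid-errors", 10, "PreferNoSchedule"},
		{"fatal-xid-errors", 20, "NoSchedule"},
	}

	for _, p := range proposals {
		// Keep the proposal only if nothing is set yet or it has higher priority.
		if taintApplied[kv] == "" || p.priority > taintPriority[kv] {
			taintApplied[kv] = p.effect
			taintPriority[kv] = p.priority
		}
	}

	fmt.Println(taintApplied[kv]) // NoSchedule: the higher-priority ruleset wins
}
```

The same comparison appears in updateTaintMaps further down, where it runs under taintUpdateMu so that the per-ruleset goroutines cannot interleave their read-modify-write on the shared maps.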
- if r.handleQuarantinedNode(ctx, event.HealthEvent) { - totalEventsSkipped.Inc() + var labelsMap sync.Map - status = model.AlreadyQuarantined - } else { - status = model.UnQuarantined - } + var isCordoned atomic.Bool - return &status, common.RuleEvaluationNotApplicable + r.evaluateRulesets( + event, ruleSetEvals, rulesetsConfig, + taintAppliedMap, &labelsMap, &isCordoned, taintEffectPriorityMap, + ) + + taintsToBeApplied := r.collectTaintsToApply(taintAppliedMap) + + annotationsMap := r.prepareAnnotations(taintsToBeApplied, &labelsMap, &isCordoned) + + isNodeQuarantined := len(taintsToBeApplied) > 0 || isCordoned.Load() + if !isNodeQuarantined { + return nil } - // For healthy events, if there's no existing quarantine annotation, - // skip processing as there's no transition from unhealthy to healthy - if event.HealthEvent.IsHealthy && !quarantineAnnotationExists { - slog.Info("Skipping healthy event", - "node", event.HealthEvent.NodeName, - "event", event.HealthEvent) + return r.applyQuarantine(ctx, event, annotations, taintsToBeApplied, annotationsMap, &labelsMap, &isCordoned) +} - return nil, common.RuleEvaluationNotApplicable +func (r *Reconciler) hasExistingQuarantine(nodeName string) (map[string]string, bool) { + annotations, err := r.getNodeQuarantineAnnotations(nodeName) + if err != nil { + slog.Error("Failed to fetch annotations for node", "node", nodeName, "error", err) + return make(map[string]string), false } - type keyValTaint struct { - Key string - Value string + if annotations == nil { + return make(map[string]string), false } - var taintAppliedMap sync.Map + annotationVal, exists := annotations[common.QuarantineHealthEventAnnotationKey] - var labelsMap sync.Map + return annotations, exists && annotationVal != "" +} - var isCordoned atomic.Bool +// handleAlreadyQuarantinedNode handles events for nodes that are already quarantined +func (r *Reconciler) handleAlreadyQuarantinedNode( + ctx context.Context, + event *protos.HealthEvent, + ruleSetEvals []evaluator.RuleSetEvaluatorIface, +) *model.Status { + var status model.Status - var taintEffectPriorityMap sync.Map + if r.handleQuarantinedNode(ctx, event, ruleSetEvals) { + status = model.AlreadyQuarantined + } else { + status = model.UnQuarantined + } - ruleEvaluationRetryInFuture := false + return &status +} - for _, eval := range ruleSetEvals { - taintConfig := rulesetsConfig.TaintConfigMap[eval.GetName()] - if taintConfig != nil { - keyVal := keyValTaint{ - Key: taintConfig.Key, - Value: taintConfig.Value, - } - // initialize maps - taintAppliedMap.Store(keyVal, "") - taintEffectPriorityMap.Store(keyVal, -1) - } - } +// evaluateRulesets evaluates all rulesets against the health event in parallel +func (r *Reconciler) evaluateRulesets( + event *model.HealthEventWithStatus, + ruleSetEvals []evaluator.RuleSetEvaluatorIface, + rulesetsConfig rulesetsConfig, + taintAppliedMap map[keyValTaint]string, + labelsMap *sync.Map, + isCordoned *atomic.Bool, + taintEffectPriorityMap map[keyValTaint]int, +) { + // Handle quarantine override (force quarantine without rule evaluation) + if event.HealthEvent.QuarantineOverrides != nil && event.HealthEvent.QuarantineOverrides.Force { + isCordoned.Store(true) - var wg sync.WaitGroup + creatorID := event.HealthEvent.Metadata["creator_id"] + labelsMap.LoadOrStore(r.cordonedByLabelKey, event.HealthEvent.Agent+"-"+creatorID) + labelsMap.Store(r.cordonedReasonLabelKey, + formatCordonOrUncordonReasonValue(event.HealthEvent.Message, 63)) - if event.HealthEvent.QuarantineOverrides == nil || - 
!event.HealthEvent.QuarantineOverrides.Force { - // Evaluate each ruleset in parallel - for _, eval := range ruleSetEvals { - wg.Add(1) + return + } - go func(eval evaluator.RuleSetEvaluatorIface) { - defer wg.Done() - slog.Info("Handling event for ruleset", "event", event, "ruleset", eval.GetName()) + var wg sync.WaitGroup - rulesetEvaluations.WithLabelValues(eval.GetName()).Inc() + for _, eval := range ruleSetEvals { + wg.Add(1) - ruleEvaluatedResult, err := eval.Evaluate(event.HealthEvent) - //nolint //ignore complex nesting blocks //fix this as part of NGCC-21793 - if ruleEvaluatedResult == common.RuleEvaluationSuccess { - rulesetPassed.WithLabelValues(eval.GetName()).Inc() + go func(eval evaluator.RuleSetEvaluatorIface) { + defer wg.Done() - if shouldCordon := rulesetsConfig.CordonConfigMap[eval.GetName()]; shouldCordon { - isCordoned.Store(true) + slog.Info("Handling event for ruleset", "event", event, "ruleset", eval.GetName()) - newCordonReason := eval.GetName() + metrics.RulesetEvaluations.WithLabelValues(eval.GetName()).Inc() - if _, exist := labelsMap.Load(cordonedReasonLabelKey); exist { - oldCordonReason, _ := labelsMap.Load(cordonedReasonLabelKey) - newCordonReason = oldCordonReason.(string) + "-" + newCordonReason - } + ruleEvaluatedResult, err := eval.Evaluate(event.HealthEvent) - labelsMap.Store(cordonedReasonLabelKey, formatCordonOrUncordonReasonValue(newCordonReason, 63)) - } + switch { + case ruleEvaluatedResult == common.RuleEvaluationSuccess: + r.handleSuccessfulRuleEvaluation( + eval, rulesetsConfig, labelsMap, isCordoned, taintAppliedMap, taintEffectPriorityMap) + case err != nil: + r.handleRuleEvaluationError(event.HealthEvent, eval.GetName(), err) + default: + metrics.RulesetFailed.WithLabelValues(eval.GetName()).Inc() + } + }(eval) + } - taintConfig := rulesetsConfig.TaintConfigMap[eval.GetName()] - // Apply taint and cordon based on configuration, if it is not already applied - if taintConfig != nil { - keyVal := keyValTaint{Key: taintConfig.Key, Value: taintConfig.Value} + wg.Wait() +} - currentVal, _ := taintAppliedMap.Load(keyVal) - currentEffect := currentVal.(string) +// handleSuccessfulRuleEvaluation processes a successful rule evaluation result +func (r *Reconciler) handleSuccessfulRuleEvaluation( + eval evaluator.RuleSetEvaluatorIface, + rulesetsConfig rulesetsConfig, + labelsMap *sync.Map, + isCordoned *atomic.Bool, + taintAppliedMap map[keyValTaint]string, + taintEffectPriorityMap map[keyValTaint]int, +) { + metrics.RulesetPassed.WithLabelValues(eval.GetName()).Inc() - currentPriorityVal, _ := taintEffectPriorityMap.Load(keyVal) - currentPriority := currentPriorityVal.(int) + shouldCordon := rulesetsConfig.CordonConfigMap[eval.GetName()] + if shouldCordon { + isCordoned.Store(true) - newPriority := rulesetsConfig.RuleSetPriorityMap[eval.GetName()] + newCordonReason := eval.GetName() - // Update if no effect set yet or new priority is higher - if currentEffect == "" || (currentEffect != "" && newPriority > currentPriority) { - taintEffectPriorityMap.Store(keyVal, newPriority) - taintAppliedMap.Store(keyVal, taintConfig.Effect) - } - } - } else if err != nil { - slog.Error("Error while evaluating event for ruleset", "event", event.HealthEvent, "ruleset", eval.GetName(), "error", err) + if oldReasonVal, exist := labelsMap.Load(r.cordonedReasonLabelKey); exist { + oldCordonReason := oldReasonVal.(string) + newCordonReason = oldCordonReason + "-" + newCordonReason + } - processingErrors.WithLabelValues("ruleset_evaluation_error").Inc() + 
labelsMap.Store(r.cordonedReasonLabelKey, formatCordonOrUncordonReasonValue(newCordonReason, 63)) + } - rulesetFailed.WithLabelValues(eval.GetName()).Inc() - } else if ruleEvaluatedResult == common.RuleEvaluationRetryAgainInFuture { + taintConfig := rulesetsConfig.TaintConfigMap[eval.GetName()] + if taintConfig != nil { + r.updateTaintMaps(eval.GetName(), taintConfig, rulesetsConfig, taintAppliedMap, taintEffectPriorityMap) + } +} - slog.Debug("Rule evaluation not succeeded, will re-evaluate in next iteration", "event", event.HealthEvent) - ruleEvaluationRetryInFuture = true +// updateTaintMaps updates taint maps with priority-based logic to handle multiple rulesets +// affecting the same taint key-value pair. +func (r *Reconciler) updateTaintMaps( + evalName string, + taintConfig *config.Taint, + rulesetsConfig rulesetsConfig, + taintAppliedMap map[keyValTaint]string, + taintEffectPriorityMap map[keyValTaint]int, +) { + keyVal := keyValTaint{Key: taintConfig.Key, Value: taintConfig.Value} + newPriority := rulesetsConfig.RuleSetPriorityMap[evalName] - } else { - rulesetFailed.WithLabelValues(eval.GetName()).Inc() - } - }(eval) - } + r.taintUpdateMu.Lock() + defer r.taintUpdateMu.Unlock() - wg.Wait() + currentEffect := taintAppliedMap[keyVal] + currentPriority := taintEffectPriorityMap[keyVal] - if ruleEvaluationRetryInFuture { - return nil, common.RuleEvaluationRetryAgainInFuture - } - } else { - isCordoned.Store(true) - labelsMap.LoadOrStore(cordonedByLabelKey, event.HealthEvent.Agent+"-"+event.HealthEvent.Metadata["creator_id"]) - labelsMap.Store(cordonedReasonLabelKey, - formatCordonOrUncordonReasonValue(event.HealthEvent.Message, 63)) + if currentEffect == "" || newPriority > currentPriority { + taintEffectPriorityMap[keyVal] = newPriority + taintAppliedMap[keyVal] = taintConfig.Effect } +} - taintsToBeApplied := []config.Taint{} - // Check the taint map and collect the taints which are to be applied - taintAppliedMap.Range(func(k, v interface{}) bool { - keyVal := k.(keyValTaint) - effect := v.(string) +// handleRuleEvaluationError handles errors during rule evaluation +func (r *Reconciler) handleRuleEvaluationError( + event *protos.HealthEvent, + evalName string, + err error, +) { + slog.Error("Rule evaluation failed", "ruleset", evalName, "node", event.NodeName, "error", err) + metrics.ProcessingErrors.WithLabelValues("ruleset_evaluation_error").Inc() + metrics.RulesetFailed.WithLabelValues(evalName).Inc() +} + +// collectTaintsToApply collects all taints that should be applied from the taint map +func (r *Reconciler) collectTaintsToApply(taintAppliedMap map[keyValTaint]string) []config.Taint { + taintsToBeApplied := make([]config.Taint, 0, len(taintAppliedMap)) + for keyVal, effect := range taintAppliedMap { if effect != "" { taintsToBeApplied = append(taintsToBeApplied, config.Taint{ Key: keyVal.Key, @@ -661,160 +554,217 @@ func (r *Reconciler) handleEvent( Effect: effect, }) } + } - return true - }) + return taintsToBeApplied +} - // collect annotations to be applied if any +// prepareAnnotations prepares annotations and labels to be applied if any +func (r *Reconciler) prepareAnnotations( + taintsToBeApplied []config.Taint, + labelsMap *sync.Map, + isCordoned *atomic.Bool, +) map[string]string { annotationsMap := map[string]string{} if len(taintsToBeApplied) > 0 { - // store the taints applied as an annotation taintsJsonStr, err := json.Marshal(taintsToBeApplied) if err != nil { - slog.Error("Error marshalling taints", "taints", taintsToBeApplied, "event", event, "error", err) + 
slog.Error("Failed to marshal taints for annotation", "error", err) } else { annotationsMap[common.QuarantineHealthEventAppliedTaintsAnnotationKey] = string(taintsJsonStr) } } if isCordoned.Load() { - // store cordon as an annotation annotationsMap[common.QuarantineHealthEventIsCordonedAnnotationKey] = common.QuarantineHealthEventIsCordonedAnnotationValueTrue - labelsMap.LoadOrStore(cordonedByLabelKey, common.ServiceName) - - labelsMap.Store(cordonedTimestampLabelKey, time.Now().UTC().Format("2006-01-02T15-04-05Z")) + labelsMap.LoadOrStore(r.cordonedByLabelKey, common.ServiceName) + labelsMap.Store(r.cordonedTimestampLabelKey, time.Now().UTC().Format("2006-01-02T15-04-05Z")) labelsMap.Store(string(statemanager.NVSentinelStateLabelKey), string(statemanager.QuarantinedLabelValue)) } - isNodeQuarantined := (len(taintsToBeApplied) > 0 || isCordoned.Load()) + return annotationsMap +} - //nolint //ignore complex nested block //fix this as part of NGCC-21793 - if isNodeQuarantined { - // Record an event to sliding window before actually quarantining - if r.config.CircuitBreakerEnabled && (event.HealthEvent.QuarantineOverrides == nil || - !event.HealthEvent.QuarantineOverrides.Force) { - r.cb.AddCordonEvent(event.HealthEvent.NodeName) - } +// applyQuarantine applies quarantine actions to a node (taints, cordon, annotations) +func (r *Reconciler) applyQuarantine( + ctx context.Context, + event *model.HealthEventWithStatus, + annotations map[string]string, + taintsToBeApplied []config.Taint, + annotationsMap map[string]string, + labelsMap *sync.Map, + isCordoned *atomic.Bool, +) *model.Status { + r.recordCordonEventInCircuitBreaker(event) - // Create health events structure for the new quarantine with sanitized health event - healthEvents := healthEventsAnnotation.NewHealthEventsAnnotationMap() - updated := healthEvents.AddOrUpdateEvent(event.HealthEvent) + healthEvents := healthEventsAnnotation.NewHealthEventsAnnotationMap() + updated := healthEvents.AddOrUpdateEvent(event.HealthEvent) - if !updated { - slog.Info("Health event already exists for node, skipping quarantine", "event", event.HealthEvent, "node", event.HealthEvent.NodeName) - return nil, common.RuleEvaluationNotApplicable - } + if !updated { + slog.Info("Health event already exists for node, skipping quarantine", + "event", event.HealthEvent, "node", event.HealthEvent.NodeName) - eventJsonStr, err := json.Marshal(healthEvents) - if err != nil { - slog.Error("Error marshalling health events", "error", err) - } else { - annotationsMap[common.QuarantineHealthEventAnnotationKey] = string(eventJsonStr) - } + return nil + } - labels := map[string]string{} - labelsMap.Range(func(key, value any) bool { - strKey, okKey := key.(string) - strValue, okValue := value.(string) - if okKey && okValue { + if err := r.addHealthEventAnnotation(healthEvents, annotationsMap); err != nil { + return nil + } + + // Remove manual uncordon annotation if present before applying new quarantine + r.cleanupManualUncordonAnnotation(ctx, event.HealthEvent.NodeName, annotations) + + if !r.config.CircuitBreakerEnabled { + slog.Info("Circuit breaker is disabled, proceeding with quarantine action without protection", + "node", event.HealthEvent.NodeName) + } + + // Convert sync.Map to regular map for K8s API call + labels := make(map[string]string) + + labelsMap.Range(func(key, value any) bool { + if strKey, ok := key.(string); ok { + if strValue, ok := value.(string); ok { labels[strKey] = strValue } - return true - }) + } - // Remove manual uncordon annotation if 
present before applying new quarantine - r.removeManualUncordonAnnotationIfPresent(ctx, event.HealthEvent.NodeName, annotations) + return true + }) - if !r.config.CircuitBreakerEnabled { - slog.Info("Circuit breaker is disabled, proceeding with quarantine action for node without circuit breaker protection", "node", event.HealthEvent.NodeName) - } + err := r.k8sClient.QuarantineNodeAndSetAnnotations( + ctx, + event.HealthEvent.NodeName, + taintsToBeApplied, + isCordoned.Load(), + annotationsMap, + labels, + ) + if err != nil { + slog.Error("Failed to taint and cordon node", "node", event.HealthEvent.NodeName, "error", err) + metrics.ProcessingErrors.WithLabelValues("taint_and_cordon_error").Inc() - if err := r.config.K8sClient.TaintAndCordonNodeAndSetAnnotations( - ctx, - event.HealthEvent.NodeName, - taintsToBeApplied, - isCordoned.Load(), - annotationsMap, - labels, - ); err != nil { - slog.Error("Error updating node", "event", event.HealthEvent, "error", err) + return nil + } - processingErrors.WithLabelValues("taint_and_cordon_error").Inc() + r.updateQuarantineMetrics(event.HealthEvent.NodeName, taintsToBeApplied, isCordoned) - isNodeQuarantined = false - } else { - totalNodesQuarantined.WithLabelValues(event.HealthEvent.NodeName).Inc() - currentQuarantinedNodes.WithLabelValues(event.HealthEvent.NodeName).Inc() + status := model.Quarantined - // Update cache with the new annotations that were just added to the node - // This ensures subsequent events in the same batch see the updated annotations - r.updateCacheWithQuarantineAnnotations(event.HealthEvent.NodeName, annotationsMap) + return &status +} - // update the map here so that later we can refer to it and update the quarantined nodes - r.nodeInfo.MarkNodeQuarantineStatusCache(event.HealthEvent.NodeName, isNodeQuarantined, true) +// recordCordonEventInCircuitBreaker records a cordon event in the circuit breaker if enabled +func (r *Reconciler) recordCordonEventInCircuitBreaker(event *model.HealthEventWithStatus) { + if r.config.CircuitBreakerEnabled && + (event.HealthEvent.QuarantineOverrides == nil || !event.HealthEvent.QuarantineOverrides.Force) { + r.cb.AddCordonEvent(event.HealthEvent.NodeName) + } +} - for _, taint := range taintsToBeApplied { - taintsApplied.WithLabelValues(taint.Key, taint.Effect).Inc() - } +// addHealthEventAnnotation adds health event annotation to the annotations map +func (r *Reconciler) addHealthEventAnnotation( + healthEvents *healthEventsAnnotation.HealthEventsAnnotationMap, + annotationsMap map[string]string, +) error { + eventJsonStr, err := json.Marshal(healthEvents) + if err != nil { + return fmt.Errorf("failed to marshal health events: %w", err) + } - if isCordoned.Load() { - cordonsApplied.Inc() - } + annotationsMap[common.QuarantineHealthEventAnnotationKey] = string(eventJsonStr) + + return nil +} + +// updateQuarantineMetrics updates Prometheus metrics after quarantining a node +func (r *Reconciler) updateQuarantineMetrics( + nodeName string, + taintsToBeApplied []config.Taint, + isCordoned *atomic.Bool, +) { + metrics.TotalNodesQuarantined.WithLabelValues(nodeName).Inc() + metrics.CurrentQuarantinedNodes.WithLabelValues(nodeName).Set(1) + + for _, taint := range taintsToBeApplied { + metrics.TaintsApplied.WithLabelValues(taint.Key, taint.Effect).Inc() + } + + if isCordoned.Load() { + metrics.CordonsApplied.Inc() + } +} + +// eventMatchesAnyRule checks if an event matches at least one configured ruleset +func (r *Reconciler) eventMatchesAnyRule( + event *protos.HealthEvent, + ruleSetEvals 
[]evaluator.RuleSetEvaluatorIface, +) bool { + for _, eval := range ruleSetEvals { + result, err := eval.Evaluate(event) + if err != nil { + continue + } + + if result == common.RuleEvaluationSuccess { + return true } } - if isNodeQuarantined { - status = model.Quarantined + return false +} + +// handleUnhealthyEventOnQuarantinedNode handles unhealthy events on already-quarantined nodes +func (r *Reconciler) handleUnhealthyEventOnQuarantinedNode( + ctx context.Context, + event *protos.HealthEvent, + ruleSetEvals []evaluator.RuleSetEvaluatorIface, + healthEventsAnnotationMap *healthEventsAnnotation.HealthEventsAnnotationMap, +) bool { + if !r.eventMatchesAnyRule(event, ruleSetEvals) { + slog.Info("Unhealthy event on node doesn't match any rules, skipping annotation update", + "checkName", event.CheckName, "node", event.NodeName) + return true + } + + added := healthEventsAnnotationMap.AddOrUpdateEvent(event) + + if added { + slog.Info("Added entity failures for check on node", + "checkName", event.CheckName, "node", event.NodeName, "totalTrackedEntities", healthEventsAnnotationMap.Count()) + + if err := r.addEventToAnnotation(ctx, event); err != nil { + slog.Error("Failed to update health events annotation", "error", err) + return true + } } else { - return nil, common.RuleEvaluationNotApplicable + slog.Debug("All entities already tracked for check on node", + "checkName", event.CheckName, "node", event.NodeName) } - return &status, common.RuleEvaluationNotApplicable + return true } func (r *Reconciler) handleQuarantinedNode( ctx context.Context, event *protos.HealthEvent, + ruleSetEvals []evaluator.RuleSetEvaluatorIface, ) bool { - // Get and validate health events quarantine annotations - healthEventsAnnotationMap, annotations, err := r.getAndValidateHealthEventsQuarantineAnnotations(ctx, event) + healthEventsAnnotationMap, annotations, err := r.getHealthEventsFromAnnotation(event) if err != nil { - processingErrors.WithLabelValues("get_node_annotations_error").Inc() - // Error cases return true to keep node quarantined, or false if no annotation exists - return err.Error() != "no quarantine annotation" + metrics.ProcessingErrors.WithLabelValues("get_node_annotations_error").Inc() + return !errors.Is(err, errNoQuarantineAnnotation) } - // Check if any entities from this event are already tracked _, hasExistingCheck := healthEventsAnnotationMap.GetEvent(event) if !event.IsHealthy { - // Handle unhealthy event - add new entity failures - added := healthEventsAnnotationMap.AddOrUpdateEvent(event) - - if added { - slog.Info("Added entity failures for check on node", - "check", event.CheckName, - "node", event.NodeName, - "trackedEntities", healthEventsAnnotationMap.Count()) - - // Update the annotation with the new entity failures - if err := r.updateHealthEventsQuarantineAnnotation(ctx, event.NodeName, healthEventsAnnotationMap); err != nil { - slog.Error("Failed to update health events annotation", "error", err) - return true - } - } else { - slog.Debug("All entities already tracked for check %s on node %s", - event.CheckName, event.NodeName) - } - - // Node remains quarantined - return true + return r.handleUnhealthyEventOnQuarantinedNode(ctx, event, ruleSetEvals, healthEventsAnnotationMap) } - // Handle healthy event if !hasExistingCheck { slog.Debug("Received healthy event for untracked check %s on node %s (other checks may still be failing)", event.CheckName, event.NodeName) @@ -836,21 +786,18 @@ func (r *Reconciler) handleQuarantinedNode( event.CheckName, event.NodeName) } - // Check 
if all checks have recovered if healthEventsAnnotationMap.IsEmpty() { - // All checks recovered - uncordon the node slog.Info("All health checks recovered for node, proceeding with uncordon", "node", event.NodeName) return r.performUncordon(ctx, event, annotations) } - // Update the annotation with the modified health events structure - if err := r.updateHealthEventsQuarantineAnnotation(ctx, event.NodeName, healthEventsAnnotationMap); err != nil { + // Remove this event's entities from the node's annotation + if err := r.removeEventFromAnnotation(ctx, event); err != nil { slog.Error("Failed to update health events annotation after recovery", "error", err) return true } - // Node remains quarantined as there are still failing checks slog.Info("Node remains quarantined with failing checks", "node", event.NodeName, "failingChecksCount", healthEventsAnnotationMap.Count(), @@ -859,79 +806,132 @@ func (r *Reconciler) handleQuarantinedNode( return true } -func (r *Reconciler) getAndValidateHealthEventsQuarantineAnnotations( - ctx context.Context, +func (r *Reconciler) getHealthEventsFromAnnotation( event *protos.HealthEvent, ) (*healthEventsAnnotation.HealthEventsAnnotationMap, map[string]string, error) { - annotations, err := r.getNodeQuarantineAnnotations(ctx, event.NodeName) + annotations, err := r.getNodeQuarantineAnnotations(event.NodeName) if err != nil { - slog.Error("Error getting node annotations", "event", event, "error", err) - processingErrors.WithLabelValues("get_node_annotations_error").Inc() + slog.Error("Failed to get node annotations for node", "node", event.NodeName, "error", err) + metrics.ProcessingErrors.WithLabelValues("get_node_annotations_error").Inc() - return nil, nil, fmt.Errorf("failed to get annotations") + return nil, nil, fmt.Errorf("failed to get annotations: %w", err) } quarantineAnnotationStr, exists := annotations[common.QuarantineHealthEventAnnotationKey] if !exists || quarantineAnnotationStr == "" { slog.Info("No quarantine annotation found for node", "node", event.NodeName) - return nil, nil, fmt.Errorf("no quarantine annotation") + return nil, nil, errNoQuarantineAnnotation } // Try to unmarshal as HealthEventsAnnotationMap first var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap - err = json.Unmarshal([]byte(quarantineAnnotationStr), &healthEventsMap) + if err != nil { - // Fallback: try to unmarshal as single HealthEvent for backward compatibility var singleHealthEvent protos.HealthEvent if err2 := json.Unmarshal([]byte(quarantineAnnotationStr), &singleHealthEvent); err2 == nil { - // Convert single event to health events structure - slog.Info("Converting single health event to health events structure for node", "node", event.NodeName) + slog.Info("Found old format annotation for node, converting locally", "node", event.NodeName) healthEventsMap = *healthEventsAnnotation.NewHealthEventsAnnotationMap() healthEventsMap.AddOrUpdateEvent(&singleHealthEvent) - - // Update the annotation to new format for consistency - if err := r.updateHealthEventsQuarantineAnnotation(ctx, event.NodeName, &healthEventsMap); err != nil { - slog.Warn("Failed to update annotation to new format", "error", err) - } } else { - slog.Error("error unmarshalling annotation for node %s: %+v", event.NodeName, err) - return nil, nil, fmt.Errorf("failed to unmarshal annotation") + return nil, nil, fmt.Errorf("failed to unmarshal annotation for node %s: %w", event.NodeName, err) } } return &healthEventsMap, annotations, nil } -func (r *Reconciler) 
updateHealthEventsQuarantineAnnotation( +// addEventToAnnotation adds or updates a health event in the node's quarantine annotation +func (r *Reconciler) addEventToAnnotation( ctx context.Context, - nodeName string, - healthEvents *healthEventsAnnotation.HealthEventsAnnotationMap, + event *protos.HealthEvent, ) error { - annotationBytes, err := json.Marshal(healthEvents) - if err != nil { - slog.Error("Error marshalling health events annotation", "error", err) - return fmt.Errorf("failed to marshal health events: %w", err) - } + updateFn := func(node *corev1.Node) error { + if node.Annotations == nil { + node.Annotations = make(map[string]string) + } - annotationsToUpdate := map[string]string{ - common.QuarantineHealthEventAnnotationKey: string(annotationBytes), - } + healthEventsMap := healthEventsAnnotation.NewHealthEventsAnnotationMap() + existingAnnotation := node.Annotations[common.QuarantineHealthEventAnnotationKey] - if err := r.config.K8sClient.UpdateNodeAnnotations(ctx, nodeName, annotationsToUpdate); err != nil { - slog.Error("Error updating node annotations for multi-event", "error", err) - return fmt.Errorf("failed to update node annotations for multi-event on %s: %w", nodeName, err) + if existingAnnotation != "" { + if err := json.Unmarshal([]byte(existingAnnotation), healthEventsMap); err != nil { + var singleEvent protos.HealthEvent + if err2 := json.Unmarshal([]byte(existingAnnotation), &singleEvent); err2 == nil { + healthEventsMap.AddOrUpdateEvent(&singleEvent) + } else { + return fmt.Errorf("failed to parse existing annotation (tried both formats): %w", err) + } + } + } + + added := healthEventsMap.AddOrUpdateEvent(event) + if !added { + slog.Debug("Event already exists for node, no annotation update needed", "node", event.NodeName) + return nil + } + + annotationBytes, err := json.Marshal(healthEventsMap) + if err != nil { + return fmt.Errorf("failed to marshal health events: %w", err) + } + + node.Annotations[common.QuarantineHealthEventAnnotationKey] = string(annotationBytes) + + slog.Debug("Added/updated event for node", "node", event.NodeName, "totalEntityLevelEvents", healthEventsMap.Count()) + + return nil } - slog.Info("Updated health events quarantine annotation for node %s - %d checks tracked", - nodeName, healthEvents.Count()) + return r.k8sClient.UpdateNode(ctx, event.NodeName, updateFn) +} + +// removeEventFromAnnotation removes entities from a health event in the node's quarantine annotation +func (r *Reconciler) removeEventFromAnnotation( + ctx context.Context, + event *protos.HealthEvent, +) error { + updateFn := func(node *corev1.Node) error { + if node.Annotations == nil { + return nil + } + + existingAnnotation, exists := node.Annotations[common.QuarantineHealthEventAnnotationKey] + if !exists || existingAnnotation == "" { + return nil + } + + healthEventsMap := healthEventsAnnotation.NewHealthEventsAnnotationMap() + if err := json.Unmarshal([]byte(existingAnnotation), healthEventsMap); err != nil { + var singleEvent protos.HealthEvent + if err2 := json.Unmarshal([]byte(existingAnnotation), &singleEvent); err2 == nil { + healthEventsMap.AddOrUpdateEvent(&singleEvent) + } else { + return fmt.Errorf("failed to parse existing annotation (tried both formats): %w", err) + } + } + + removed := healthEventsMap.RemoveEvent(event) + if removed == 0 { + slog.Debug("No matching entities to remove for node, no annotation update needed", "node", event.NodeName) + return nil + } - // Update cache - r.updateCacheWithQuarantineAnnotations(nodeName, 
annotationsToUpdate) + annotationBytes, err := json.Marshal(healthEventsMap) + if err != nil { + return fmt.Errorf("failed to marshal health events after removal: %w", err) + } - return nil + node.Annotations[common.QuarantineHealthEventAnnotationKey] = string(annotationBytes) + + slog.Debug("Removed entities for node", "node", event.NodeName, "remainingEntityLevelEvents", healthEventsMap.Count()) + + return nil + } + + return r.k8sClient.UpdateNode(ctx, event.NodeName, updateFn) } func (r *Reconciler) performUncordon( @@ -946,38 +946,47 @@ func (r *Reconciler) performUncordon( taintsToBeRemoved, annotationsToBeRemoved, isUnCordon, labelsMap, err := r.prepareUncordonParams( event, annotations) if err != nil { - slog.Error("Error preparing uncordon params", "event", event, "error", err) + slog.Error("Failed to prepare uncordon params for node", "node", event.NodeName, "error", err) return true } - // Nothing to uncordon if len(taintsToBeRemoved) == 0 && !isUnCordon { return false } - // Add the main quarantine annotation to removal list + if !isUnCordon { + slog.Warn("Node is not cordoned but has quarantine taints/annotations, proceeding with cleanup", + "node", event.NodeName) + } + annotationsToBeRemoved = append(annotationsToBeRemoved, common.QuarantineHealthEventAnnotationKey) if !r.config.CircuitBreakerEnabled { slog.Info("Circuit breaker is disabled, proceeding with unquarantine action for node", "node", event.NodeName) } - if err := r.config.K8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations( + labelsToRemove := []string{ + r.cordonedByLabelKey, + r.cordonedReasonLabelKey, + r.cordonedTimestampLabelKey, + statemanager.NVSentinelStateLabelKey, + } + + if err := r.k8sClient.UnQuarantineNodeAndRemoveAnnotations( ctx, event.NodeName, taintsToBeRemoved, - isUnCordon, annotationsToBeRemoved, - []string{cordonedByLabelKey, cordonedReasonLabelKey, cordonedTimestampLabelKey, statemanager.NVSentinelStateLabelKey}, + labelsToRemove, labelsMap, ); err != nil { - slog.Error("Error updating node", "event", event, "error", err) - processingErrors.WithLabelValues("untaint_and_uncordon_error").Inc() + slog.Error("Failed to untaint and uncordon node", "node", event.NodeName, "error", err) + metrics.ProcessingErrors.WithLabelValues("untaint_and_uncordon_error").Inc() return true } - r.updateUncordonMetricsAndCache(event.NodeName, taintsToBeRemoved, isUnCordon, annotationsToBeRemoved) + r.updateUncordonMetrics(event.NodeName, taintsToBeRemoved, isUnCordon) return false } @@ -994,7 +1003,6 @@ func (r *Reconciler) prepareUncordonParams( labelsMap = map[string]string{} ) - // Check taints quarantineAnnotationEventTaintsAppliedStr, taintsExists := annotations[common.QuarantineHealthEventAppliedTaintsAnnotationKey] if taintsExists && quarantineAnnotationEventTaintsAppliedStr != "" { @@ -1003,13 +1011,10 @@ func (r *Reconciler) prepareUncordonParams( err := json.Unmarshal([]byte(quarantineAnnotationEventTaintsAppliedStr), &taintsToBeRemoved) if err != nil { - slog.Error("Error unmarshalling taints annotation", - "annotation", quarantineAnnotationEventTaintsAppliedStr, "event", event, "error", err) - return nil, nil, false, nil, fmt.Errorf("failed to unmarshal taints annotation: %w", err) + return nil, nil, false, nil, fmt.Errorf("failed to unmarshal taints annotation for node %s: %w", event.NodeName, err) } } - // Check cordon status quarantineAnnotationEventIsCordonStr, cordonExists := annotations[common.QuarantineHealthEventIsCordonedAnnotationKey] if cordonExists && quarantineAnnotationEventIsCordonStr 
== common.QuarantineHealthEventIsCordonedAnnotationValueTrue { @@ -1017,74 +1022,33 @@ func (r *Reconciler) prepareUncordonParams( annotationsToBeRemoved = append(annotationsToBeRemoved, common.QuarantineHealthEventIsCordonedAnnotationKey) - labelsMap[uncordonedByLabelKey] = common.ServiceName - labelsMap[uncordonedTimestampLabelKey] = time.Now().UTC().Format("2006-01-02T15-04-05Z") + labelsMap[r.uncordonedByLabelKey] = common.ServiceName + labelsMap[r.uncordonedTimestampLabelKey] = time.Now().UTC().Format("2006-01-02T15-04-05Z") } return taintsToBeRemoved, annotationsToBeRemoved, isUnCordon, labelsMap, nil } -// updateUncordonMetricsAndCache updates metrics and cache after uncordoning -func (r *Reconciler) updateUncordonMetricsAndCache( +func (r *Reconciler) updateUncordonMetrics( nodeName string, taintsToBeRemoved []config.Taint, isUnCordon bool, - annotationsToBeRemoved []string, ) { - totalNodesUnquarantined.WithLabelValues(nodeName).Inc() - currentQuarantinedNodes.WithLabelValues(nodeName).Dec() - slog.Info("Decremented currentQuarantinedNodes metric for unquarantined node", "node", nodeName) - - // Update cache - r.updateCacheWithUnquarantineAnnotations(nodeName, annotationsToBeRemoved) - r.nodeInfo.MarkNodeQuarantineStatusCache(nodeName, false, false) + metrics.TotalNodesUnquarantined.WithLabelValues(nodeName).Inc() + metrics.CurrentQuarantinedNodes.WithLabelValues(nodeName).Set(0) + slog.Info("Set currentQuarantinedNodes to 0 for unquarantined node", "node", nodeName) - // Update taint metrics for _, taint := range taintsToBeRemoved { - taintsRemoved.WithLabelValues(taint.Key, taint.Effect).Inc() + metrics.TaintsRemoved.WithLabelValues(taint.Key, taint.Effect).Inc() } if isUnCordon { - cordonsRemoved.Inc() + metrics.CordonsRemoved.Inc() } } -func (r *Reconciler) updateNodeQuarantineStatus( - ctx context.Context, - healthEventCollection *mongo.Collection, - event bson.M, - nodeQuarantinedStatus *model.Status, -) error { - if nodeQuarantinedStatus == nil { - return fmt.Errorf("nodeQuarantinedStatus is nil") - } - - document, ok := event["fullDocument"].(bson.M) - if !ok { - return fmt.Errorf("error extracting fullDocument from event: %+v", event) - } - - filter := bson.M{"_id": document["_id"]} - - update := bson.M{ - "$set": bson.M{ - "healtheventstatus.nodequarantined": *nodeQuarantinedStatus, - }, - } - - if _, err := healthEventCollection.UpdateOne(ctx, filter, update); err != nil { - return fmt.Errorf("error updating document with _id: %v, error: %w", document["_id"], err) - } - - slog.Info("Document updated", "_id", document["_id"], "nodeQuarantinedStatus", *nodeQuarantinedStatus) - - return nil -} - func formatCordonOrUncordonReasonValue(input string, length int) string { - re := regexp.MustCompile(`[^a-zA-Z0-9_.-]`) - - formatted := re.ReplaceAllString(input, "-") + formatted := labelValueRegex.ReplaceAllString(input, "-") if len(formatted) > length { formatted = formatted[:length] @@ -1096,39 +1060,14 @@ func formatCordonOrUncordonReasonValue(input string, length int) string { return formatted } -// getNodeQuarantineAnnotations retrieves quarantine annotations from cache or API fallback -func (r *Reconciler) getNodeQuarantineAnnotations(ctx context.Context, nodeName string) (map[string]string, error) { - // Try to get annotations from cache first - r.cacheMutex.RLock() - cached, ok := r.nodeAnnotationsCache.Load(nodeName) - r.cacheMutex.RUnlock() - - if ok { - orig := cached.(map[string]string) - // Create a defensive copy to prevent external mutations - dup := 
make(map[string]string, len(orig)) - for k, v := range orig { - dup[k] = v - } - - slog.Debug("Using cached annotations for node", "node", nodeName) - - return dup, nil - } - - // Fall back to API call if not in cache - return r.fetchAndCacheQuarantineAnnotations(ctx, nodeName) -} - -// fetchAndCacheQuarantineAnnotations fetches all annotations from API and caches only quarantine ones -func (r *Reconciler) fetchAndCacheQuarantineAnnotations(ctx context.Context, - nodeName string) (map[string]string, error) { - allAnnotations, err := r.config.K8sClient.GetNodeAnnotations(ctx, nodeName) +// getNodeQuarantineAnnotations retrieves quarantine annotations from the informer cache +func (r *Reconciler) getNodeQuarantineAnnotations(nodeName string) (map[string]string, error) { + node, err := r.k8sClient.NodeInformer.GetNode(nodeName) if err != nil { - return nil, fmt.Errorf("failed to get node annotations for %s: %w", nodeName, err) + return nil, fmt.Errorf("failed to get node from cache: %w", err) } - // Extract and store only quarantine annotations in cache + // Extract only quarantine annotations quarantineAnnotations := make(map[string]string) quarantineKeys := []string{ common.QuarantineHealthEventAnnotationKey, @@ -1137,229 +1076,68 @@ func (r *Reconciler) fetchAndCacheQuarantineAnnotations(ctx context.Context, common.QuarantinedNodeUncordonedManuallyAnnotationKey, } - for _, key := range quarantineKeys { - if value, exists := allAnnotations[key]; exists { - quarantineAnnotations[key] = value - } - } - - // Store all nodes in cache (even with empty quarantine annotations) - // This prevents repeated API calls for the same node - r.cacheMutex.Lock() - r.nodeAnnotationsCache.Store(nodeName, quarantineAnnotations) - r.cacheMutex.Unlock() - - if len(quarantineAnnotations) > 0 { - slog.Debug("Cached quarantine annotations for node", "node", nodeName) - } - - // Return a defensive copy to prevent external mutations of the cached map - returnCopy := make(map[string]string, len(quarantineAnnotations)) - for k, v := range quarantineAnnotations { - returnCopy[k] = v - } - - return returnCopy, nil -} - -// handleNodeAnnotationChange updates the cached annotations for a node when notified by the informer -func (r *Reconciler) handleNodeAnnotationChange(nodeName string, annotations map[string]string) { - r.cacheMutex.Lock() - defer r.cacheMutex.Unlock() - - if annotations == nil { - // Node was deleted, remove from cache - r.nodeAnnotationsCache.Delete(nodeName) - slog.Debug("Removed annotations from cache for deleted node", "node", nodeName) - - return - } - - // Since we only cache quarantine annotations and the informer only sends quarantine annotations, - // we can simply replace the entire cache entry - // Store all nodes in cache (even with empty quarantine annotations) to prevent API calls - r.nodeAnnotationsCache.Store(nodeName, annotations) - - if len(annotations) > 0 { - slog.Debug("Updated quarantine annotations in cache for node", "node", nodeName) - } else { - slog.Debug("Updated cache for node (no quarantine annotations)", "node", nodeName) - } -} - -// updateCacheWithQuarantineAnnotations updates the cached annotations for a node -// after quarantine annotations have been added to the actual node -func (r *Reconciler) updateCacheWithQuarantineAnnotations(nodeName string, newAnnotations map[string]string) { - r.cacheMutex.Lock() - defer r.cacheMutex.Unlock() - - if cached, ok := r.nodeAnnotationsCache.Load(nodeName); ok { - // Create a copy of the existing cached annotations - annotations := 
make(map[string]string) - for k, v := range cached.(map[string]string) { - annotations[k] = v - } - - // Add the new quarantine annotations - for key, value := range newAnnotations { - annotations[key] = value - } - - // Update the cache with the modified annotations - r.nodeAnnotationsCache.Store(nodeName, annotations) - slog.Debug("Updated cache", "node", nodeName, "annotations", newAnnotations) - } else { - // If not in cache, store a copy of the new annotations to prevent external mutations - annotationsCopy := make(map[string]string, len(newAnnotations)) - for k, v := range newAnnotations { - annotationsCopy[k] = v + if node.Annotations != nil { + for _, key := range quarantineKeys { + if value, exists := node.Annotations[key]; exists { + quarantineAnnotations[key] = value + } } - - r.nodeAnnotationsCache.Store(nodeName, annotationsCopy) - slog.Debug("Stored new annotations in cache", "node", nodeName, "annotations", newAnnotations) } -} -// updateCacheWithUnquarantineAnnotations updates the cached annotations for a node -// after quarantine annotations have been removed from the actual node -func (r *Reconciler) updateCacheWithUnquarantineAnnotations(nodeName string, removedAnnotationKeys []string) { - r.cacheMutex.Lock() - defer r.cacheMutex.Unlock() - - if cached, ok := r.nodeAnnotationsCache.Load(nodeName); ok { - // Create a copy of the existing cached annotations - annotations := make(map[string]string) - for k, v := range cached.(map[string]string) { - annotations[k] = v - } + slog.Debug("Retrieved quarantine annotations for node from informer cache", "node", nodeName) - // Remove the specified annotation keys - for _, key := range removedAnnotationKeys { - delete(annotations, key) - } - - // Update the cache with the modified annotations - r.nodeAnnotationsCache.Store(nodeName, annotations) - slog.Debug("Updated cache for node %s, removed annotation keys: %v", nodeName, removedAnnotationKeys) - } else { - // If not in cache, nothing to remove - this shouldn't happen in normal flow - slog.Debug("No cache entry found for node during unquarantine annotation update", "node", nodeName) - } + return quarantineAnnotations, nil } -// buildNodeAnnotationsCache fetches all nodes and their annotations to populate the cache -func (r *Reconciler) buildNodeAnnotationsCache(ctx context.Context) error { - slog.Info("Building node annotations cache...") - - startTime := time.Now() - - nodeList, err := r.config.K8sClient.GetK8sClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - if err != nil { - return fmt.Errorf("failed to list nodes: %w", err) - } - - // List of quarantine annotation keys we care about - quarantineKeys := []string{ - common.QuarantineHealthEventAnnotationKey, - common.QuarantineHealthEventAppliedTaintsAnnotationKey, - common.QuarantineHealthEventIsCordonedAnnotationKey, - common.QuarantinedNodeUncordonedManuallyAnnotationKey, - } - - // Use write lock for bulk cache population - r.cacheMutex.Lock() - defer r.cacheMutex.Unlock() - - nodeCount := 0 +func (r *Reconciler) cleanupManualUncordonAnnotation(ctx context.Context, nodeName string, + annotations map[string]string) { + if _, hasManualUncordon := annotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey]; hasManualUncordon { + slog.Info("Removing manual uncordon annotation from node before applying new quarantine", "node", nodeName) - for _, node := range nodeList.Items { - // Extract only the quarantine annotations - quarantineAnnotations := make(map[string]string) + updateFn := func(node *corev1.Node) 
error { + if node.Annotations == nil { + slog.Debug("Node has no annotations, manual uncordon annotation already absent", "node", nodeName) + return nil + } - if node.Annotations != nil { - for _, key := range quarantineKeys { - if value, exists := node.Annotations[key]; exists { - quarantineAnnotations[key] = value - } + if _, exists := node.Annotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey]; !exists { + slog.Debug("Manual uncordon annotation already removed from node", "node", nodeName) + return nil } - } - // Store all nodes in cache (even with empty quarantine annotations) - // This prevents API calls for nodes without quarantine annotations - r.nodeAnnotationsCache.Store(node.Name, quarantineAnnotations) + delete(node.Annotations, common.QuarantinedNodeUncordonedManuallyAnnotationKey) - if len(quarantineAnnotations) > 0 { - slog.Debug("Cached quarantine annotations", "node", node.Name, "annotations", quarantineAnnotations) + return nil } - nodeCount++ - } - - fetchDuration := time.Since(startTime) - slog.Info("Successfully built cache with quarantine annotations", "nodeCount", nodeCount, "duration", fetchDuration) - - return nil -} - -// removeManualUncordonAnnotationIfPresent removes the manual uncordon annotation from a node -// if it exists. This is called before applying a new quarantine to ensure clean state. -func (r *Reconciler) removeManualUncordonAnnotationIfPresent(ctx context.Context, nodeName string, - annotations map[string]string) { - if annotations == nil { - return - } - - if _, hasManualUncordon := annotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey]; hasManualUncordon { - slog.Info("Removing manual uncordon annotation from node before applying new quarantine", "node", nodeName) - - // Remove the manual uncordon annotation before applying quarantine - if err := r.config.K8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations( - ctx, - nodeName, - nil, // No taints to remove - false, // Not uncordoning - []string{common.QuarantinedNodeUncordonedManuallyAnnotationKey}, // Remove manual uncordon annotation - nil, // No labels to remove - nil, // No labels to add - ); err != nil { - slog.Error("Failed to remove manual uncordon annotation from node", "node", nodeName) - } else { - // Update cache to remove the manual uncordon annotation - r.updateCacheWithUnquarantineAnnotations(nodeName, - []string{common.QuarantinedNodeUncordonedManuallyAnnotationKey}) + if err := r.k8sClient.UpdateNode(ctx, nodeName, updateFn); err != nil { + slog.Error("Failed to remove manual uncordon annotation from node", "node", nodeName, "error", err) } } } // handleManualUncordon handles the case when a node is manually uncordoned while having FQ annotations func (r *Reconciler) handleManualUncordon(nodeName string) error { - ctx := context.Background() - slog.Info("Handling manual uncordon for node", "node", nodeName) - // Get the current annotations from cache or API fallback - annotations, err := r.getNodeQuarantineAnnotations(ctx, nodeName) + annotations, err := r.getNodeQuarantineAnnotations(nodeName) if err != nil { return fmt.Errorf("failed to get annotations for manually uncordoned node %s: %w", nodeName, err) } - // Check which FQ annotations exist and need to be removed annotationsToRemove := []string{} var taintsToRemove []config.Taint - // Check for taints annotation taintsKey := common.QuarantineHealthEventAppliedTaintsAnnotationKey if taintsStr, exists := annotations[taintsKey]; exists && taintsStr != "" { annotationsToRemove = append(annotationsToRemove, 
taintsKey) - // Parse taints to remove them if err := json.Unmarshal([]byte(taintsStr), &taintsToRemove); err != nil { return fmt.Errorf("failed to unmarshal taints for manually uncordoned node %s: %w", nodeName, err) } } - // Remove all FQ-related annotations if _, exists := annotations[common.QuarantineHealthEventAnnotationKey]; exists { annotationsToRemove = append(annotationsToRemove, common.QuarantineHealthEventAnnotationKey) } @@ -1368,50 +1146,26 @@ func (r *Reconciler) handleManualUncordon(nodeName string) error { annotationsToRemove = append(annotationsToRemove, common.QuarantineHealthEventIsCordonedAnnotationKey) } - // Add the manual uncordon annotation newAnnotations := map[string]string{ common.QuarantinedNodeUncordonedManuallyAnnotationKey: common.QuarantinedNodeUncordonedManuallyAnnotationValue, } - // Update the node: remove FQ annotations and any remaining taints - if err := r.config.K8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations( - ctx, + if err := r.k8sClient.HandleManualUncordonCleanup( + context.Background(), nodeName, taintsToRemove, - false, // Node is already uncordoned manually, so we don't need to uncordon again annotationsToRemove, + newAnnotations, []string{statemanager.NVSentinelStateLabelKey}, - nil, // No labels to add ); err != nil { - processingErrors.WithLabelValues("manual_uncordon_cleanup_error").Inc() - - return fmt.Errorf("failed to clean up annotations for manually uncordoned node %s: %w", nodeName, err) - } + slog.Error("Failed to clean up manually uncordoned node", "node", nodeName, "error", err) + metrics.ProcessingErrors.WithLabelValues("manual_uncordon_cleanup_error").Inc() - // Add the new annotation - if err := r.config.K8sClient.TaintAndCordonNodeAndSetAnnotations( - ctx, - nodeName, - nil, // No taints to add - false, // No cordon to add - newAnnotations, - nil, // No labels to add - ); err != nil { - return fmt.Errorf("failed to add manual uncordon annotation to node %s: %w", nodeName, err) + return err } - currentQuarantinedNodes.WithLabelValues(nodeName).Dec() - slog.Info("Decremented currentQuarantinedNodes metric for manually uncordoned node", "node", nodeName) - - // Update internal state immediately to be consistent with the metric. - // This ensures the state is correct even before the subsequent update event is processed. - // Note: The subsequent update event will call updateNodeQuarantineStatus, but it won't - // actually update the cache since we've already set it to the correct state here. - r.nodeInfo.MarkNodeQuarantineStatusCache(nodeName, false, false) - - // Note: We don't need to manually update the annotation cache here because - // after we update the node, it will trigger another update event in the NodeInformer - // which will call onNodeAnnotationsChanged to update the cache + metrics.CurrentQuarantinedNodes.WithLabelValues(nodeName).Set(0) + slog.Info("Set currentQuarantinedNodes to 0 for manually uncordoned node", "node", nodeName) slog.Info("Successfully handled manual uncordon for node", "node", nodeName) diff --git a/fault-quarantine-module/pkg/reconciler/reconciler_e2e_test.go b/fault-quarantine-module/pkg/reconciler/reconciler_e2e_test.go new file mode 100644 index 000000000..69e1ac70f --- /dev/null +++ b/fault-quarantine-module/pkg/reconciler/reconciler_e2e_test.go @@ -0,0 +1,3302 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package reconciler + +import ( + "context" + "encoding/json" + "fmt" + "log" + "os" + "sync" + "testing" + "time" + + "github.com/nvidia/nvsentinel/commons/pkg/statemanager" + "github.com/nvidia/nvsentinel/data-models/pkg/model" + "github.com/nvidia/nvsentinel/data-models/pkg/protos" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/breaker" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/evaluator" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/healthEventsAnnotation" + "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/informer" + storeclientsdk "github.com/nvidia/nvsentinel/store-client-sdk/pkg/storewatcher" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/bson/primitive" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/controller-runtime/pkg/envtest" +) + +var ( + e2eTestClient *kubernetes.Clientset + e2eTestContext context.Context + e2eTestCancelFunc context.CancelFunc + e2eTestEnv *envtest.Environment +) + +var ( + quarantineHealthEventAnnotationKey = common.QuarantineHealthEventAnnotationKey + quarantineHealthEventAppliedTaintsAnnotationKey = common.QuarantineHealthEventAppliedTaintsAnnotationKey + quarantineHealthEventIsCordonedAnnotationKey = common.QuarantineHealthEventIsCordonedAnnotationKey +) + +const ( + eventuallyTimeout = 10 * time.Second + eventuallyPollInterval = 200 * time.Millisecond + + statusCheckTimeout = 5 * time.Second + statusCheckPollInterval = 100 * time.Millisecond + + neverTimeout = 1 * time.Second + neverPollInterval = 100 * time.Millisecond +) + +func TestMain(m *testing.M) { + var err error + e2eTestContext, e2eTestCancelFunc = context.WithCancel(context.Background()) + + e2eTestEnv = &envtest.Environment{} + + e2eTestRestConfig, err := e2eTestEnv.Start() + if err != nil { + log.Fatalf("Failed to start test environment: %v", err) + } + + e2eTestClient, err = kubernetes.NewForConfig(e2eTestRestConfig) + if err != nil { + log.Fatalf("Failed to create kubernetes client: %v", err) + } + + exitCode := m.Run() + + e2eTestCancelFunc() + if err := e2eTestEnv.Stop(); err != nil { + log.Fatalf("Failed to stop test environment: %v", err) + } + os.Exit(exitCode) +} + +func createE2ETestNode(ctx context.Context, t *testing.T, name string, annotations map[string]string, labels map[string]string, taints []corev1.Taint, unschedulable bool) { + t.Helper() + + if labels == nil { + labels = make(map[string]string) + } + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Annotations: annotations, + Labels: labels, + }, + Spec: corev1.NodeSpec{ + Unschedulable: unschedulable, + Taints: taints, + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionTrue}, + }, + }, + } + + _, err := e2eTestClient.CoreV1().Nodes().Create(ctx, 
node, metav1.CreateOptions{}) + require.NoError(t, err, "Failed to create test node %s", name) +} + +func createHealthEventBSON(eventID primitive.ObjectID, nodeName, checkName string, isHealthy, isFatal bool, entities []*protos.Entity, quarantineStatus model.Status) bson.M { + entitiesBSON := []interface{}{} + for _, entity := range entities { + entitiesBSON = append(entitiesBSON, bson.M{ + "entitytype": entity.EntityType, + "entityvalue": entity.EntityValue, + }) + } + + return bson.M{ + "operationType": "insert", + "fullDocument": bson.M{ + "_id": eventID, + "healtheventstatus": bson.M{ + "nodequarantined": quarantineStatus, + }, + "healthevent": bson.M{ + "nodename": nodeName, + "agent": "gpu-health-monitor", + "componentclass": "GPU", + "checkname": checkName, + "version": uint32(1), + "ishealthy": isHealthy, + "isfatal": isFatal, + "entitiesimpacted": entitiesBSON, + }, + }, + } +} + +type StatusGetter func(eventID primitive.ObjectID) *model.Status + +// E2EReconcilerConfig holds configuration options for test reconciler setup +type E2EReconcilerConfig struct { + TomlConfig config.TomlConfig + CircuitBreakerConfig *breaker.CircuitBreakerConfig + DryRun bool +} + +// setupE2EReconciler creates a test reconciler with mock watcher +// Returns: (reconciler, mockWatcher, statusGetter, circuitBreaker) +// Note: circuitBreaker will be nil when cbConfig is nil (circuit breaker disabled) +func setupE2EReconciler(t *testing.T, ctx context.Context, tomlConfig config.TomlConfig, cbConfig *breaker.CircuitBreakerConfig) (*Reconciler, *storeclientsdk.FakeChangeStreamWatcher, StatusGetter, breaker.CircuitBreaker) { + t.Helper() + return setupE2EReconcilerWithOptions(t, ctx, E2EReconcilerConfig{ + TomlConfig: tomlConfig, + CircuitBreakerConfig: cbConfig, + DryRun: false, + }) +} + +// setupE2EReconcilerWithOptions creates a test reconciler with full configuration control +// Returns: (reconciler, mockWatcher, statusGetter, circuitBreaker) +// Note: circuitBreaker will be nil when cbConfig is nil (circuit breaker disabled) +func setupE2EReconcilerWithOptions(t *testing.T, ctx context.Context, cfg E2EReconcilerConfig) (*Reconciler, *storeclientsdk.FakeChangeStreamWatcher, StatusGetter, breaker.CircuitBreaker) { + t.Helper() + + nodeInformer, err := informer.NewNodeInformer(e2eTestClient, 0) + require.NoError(t, err) + + fqClient := &informer.FaultQuarantineClient{ + Clientset: e2eTestClient, + DryRunMode: cfg.DryRun, + NodeInformer: nodeInformer, + } + + stopCh := make(chan struct{}) + t.Cleanup(func() { close(stopCh) }) + + go nodeInformer.Run(stopCh) + + require.Eventually(t, nodeInformer.HasSynced, eventuallyTimeout, statusCheckPollInterval, "NodeInformer should sync") + + ruleSetEvals, err := evaluator.InitializeRuleSetEvaluators(cfg.TomlConfig.RuleSets, fqClient.NodeInformer) + require.NoError(t, err) + + var cb breaker.CircuitBreaker + if cfg.CircuitBreakerConfig != nil { + cbConfig := cfg.CircuitBreakerConfig + // Set defaults if not provided + percentage := cbConfig.Percentage + if percentage == 0 { + percentage = 50 + } + duration := cbConfig.Duration + if duration == 0 { + duration = 5 * time.Minute + } + namespace := cbConfig.Namespace + if namespace == "" { + namespace = "default" + } + name := cbConfig.Name + if name == "" { + name = "test-cb-" + primitive.NewObjectID().Hex()[:8] + } + + cb, err = breaker.NewSlidingWindowBreaker(ctx, breaker.Config{ + Window: duration, + TripPercentage: float64(percentage), + K8sClient: fqClient, + ConfigMapName: name, + ConfigMapNamespace: namespace, + }) 
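For orientation, the Window and TripPercentage knobs configured in this test setup can be pictured with a toy sliding-window breaker: record recent cordon events, expire anything older than the window, and report tripped once the recently cordoned share of the cluster reaches the threshold. This is only an illustrative sketch under those assumptions; the real breaker package's trip condition and state handling (it is also given a ConfigMap name and namespace here) may differ.

```go
package main

import (
	"fmt"
	"time"
)

// toyBreaker is a hypothetical stand-in for a sliding-window circuit breaker.
type toyBreaker struct {
	window         time.Duration
	tripPercentage float64
	totalNodes     int
	cordonedAt     map[string]time.Time
}

// AddCordonEvent remembers when a node was last cordoned.
func (b *toyBreaker) AddCordonEvent(node string) {
	b.cordonedAt[node] = time.Now()
}

// IsTripped expires events older than the window and trips when the
// recently cordoned fraction of the cluster reaches the threshold.
func (b *toyBreaker) IsTripped() bool {
	cutoff := time.Now().Add(-b.window)
	recent := 0

	for node, ts := range b.cordonedAt {
		if ts.Before(cutoff) {
			delete(b.cordonedAt, node) // expire old events
			continue
		}
		recent++
	}

	return float64(recent)/float64(b.totalNodes)*100 >= b.tripPercentage
}

func main() {
	b := &toyBreaker{
		window:         5 * time.Minute,
		tripPercentage: 50,
		totalNodes:     4,
		cordonedAt:     map[string]time.Time{},
	}

	b.AddCordonEvent("node-1")
	fmt.Println(b.IsTripped()) // false: 1 of 4 nodes (25%) cordoned

	b.AddCordonEvent("node-2")
	fmt.Println(b.IsTripped()) // true: 2 of 4 nodes (50%) cordoned
}
```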
+ require.NoError(t, err, "Failed to create circuit breaker") + } + + reconcilerCfg := ReconcilerConfig{ + TomlConfig: cfg.TomlConfig, + CircuitBreakerEnabled: cfg.CircuitBreakerConfig != nil, + DryRun: cfg.DryRun, + } + + r := NewReconciler(reconcilerCfg, fqClient, cb) + + if cfg.TomlConfig.LabelPrefix != "" { + r.SetLabelKeys(cfg.TomlConfig.LabelPrefix) + fqClient.SetLabelKeys(r.cordonedReasonLabelKey, r.uncordonedReasonLabelKey) + } + + // Build rulesets config (mimics reconciler.Start()) + rulesetsConfig := rulesetsConfig{ + TaintConfigMap: make(map[string]*config.Taint), + CordonConfigMap: make(map[string]bool), + RuleSetPriorityMap: make(map[string]int), + } + + for _, ruleSet := range cfg.TomlConfig.RuleSets { + if ruleSet.Taint.Key != "" { + rulesetsConfig.TaintConfigMap[ruleSet.Name] = &ruleSet.Taint + } + if ruleSet.Cordon.ShouldCordon { + rulesetsConfig.CordonConfigMap[ruleSet.Name] = true + } + if ruleSet.Priority > 0 { + rulesetsConfig.RuleSetPriorityMap[ruleSet.Name] = ruleSet.Priority + } + } + + r.precomputeTaintInitKeys(ruleSetEvals, rulesetsConfig) + + // Setup manual uncordon callback + fqClient.NodeInformer.SetOnManualUncordonCallback(r.handleManualUncordon) + + // Create mock watcher + mockWatcher := storeclientsdk.NewFakeChangeStreamWatcher() + + // Ensure the event channel is closed when test completes to terminate the processing goroutine + t.Cleanup(func() { + close(mockWatcher.EventsChan) + }) + + // Store event statuses for verification (mimics MongoDB status updates) + var statusMu sync.Mutex + eventStatuses := make(map[primitive.ObjectID]*model.Status) + + // Setup the reconciler with the callback (mimics Start()) + processEventFunc := func(ctx context.Context, event *model.HealthEventWithStatus) *model.Status { + return r.ProcessEvent(ctx, event, ruleSetEvals, rulesetsConfig) + } + + // Start event processing goroutine (mimics production event watcher) + go func() { + for event := range mockWatcher.Events() { + healthEventWithStatus := model.HealthEventWithStatus{} + if err := storeclientsdk.UnmarshalFullDocumentFromEvent(event, &healthEventWithStatus); err != nil { + continue + } + + // Get event ID (mimics MongoDB _id) + var eventID primitive.ObjectID + if fullDoc, ok := event["fullDocument"].(bson.M); ok { + if id, ok := fullDoc["_id"].(primitive.ObjectID); ok { + eventID = id + } + } + + // Process event and store status (mimics updateNodeQuarantineStatus in production) + status := processEventFunc(ctx, &healthEventWithStatus) + + statusMu.Lock() + eventStatuses[eventID] = status + statusMu.Unlock() + } + }() + + // Return status getter for tests + getStatus := func(eventID primitive.ObjectID) *model.Status { + statusMu.Lock() + defer statusMu.Unlock() + return eventStatuses[eventID] + } + + return r, mockWatcher, getStatus, cb +} + +func verifyHealthEventInAnnotation(t *testing.T, node *corev1.Node, expectedCheckName, expectedAgent, expectedComponentClass string, expectedEntityType, expectedEntityValue string) { + t.Helper() + + annotationStr := node.Annotations[quarantineHealthEventAnnotationKey] + require.NotEmpty(t, annotationStr, "Quarantine annotation should exist") + + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + err := json.Unmarshal([]byte(annotationStr), &healthEventsMap) + require.NoError(t, err, "Should unmarshal annotation") + + queryEvent := &protos.HealthEvent{ + Agent: expectedAgent, + ComponentClass: expectedComponentClass, + CheckName: expectedCheckName, + NodeName: node.Name, + Version: 1, + EntitiesImpacted: 
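+ // The query event below mirrors the fields the annotation map appears to key on
+ // (agent, component class, check name, node name and the impacted entity), so a
+ // lookup miss here means the entity was never recorded for this check.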
[]*protos.Entity{ + {EntityType: expectedEntityType, EntityValue: expectedEntityValue}, + }, + } + + storedEvent, found := healthEventsMap.GetEvent(queryEvent) + require.True(t, found, "Expected entity should be found in annotation") + require.NotNil(t, storedEvent, "Stored event should not be nil") + assert.Equal(t, expectedCheckName, storedEvent.CheckName, "Check name should match") + assert.Equal(t, expectedAgent, storedEvent.Agent, "Agent should match") + assert.Equal(t, expectedComponentClass, storedEvent.ComponentClass, "Component class should match") +} + +func verifyAppliedTaintsAnnotation(t *testing.T, node *corev1.Node, expectedTaints []config.Taint) { + t.Helper() + + taintsAnnotationStr := node.Annotations[quarantineHealthEventAppliedTaintsAnnotationKey] + require.NotEmpty(t, taintsAnnotationStr, "Applied taints annotation should exist") + + var appliedTaints []config.Taint + err := json.Unmarshal([]byte(taintsAnnotationStr), &appliedTaints) + require.NoError(t, err, "Should unmarshal taints annotation") + + assert.Len(t, appliedTaints, len(expectedTaints), "Should have expected number of taints") + + for _, expectedTaint := range expectedTaints { + found := false + for _, appliedTaint := range appliedTaints { + if appliedTaint.Key == expectedTaint.Key && + appliedTaint.Value == expectedTaint.Value && + appliedTaint.Effect == expectedTaint.Effect { + found = true + break + } + } + assert.True(t, found, "Expected taint %+v should be in applied taints annotation", expectedTaint) + } +} + +func verifyNodeTaintsMatch(t *testing.T, node *corev1.Node, expectedTaints []config.Taint) { + t.Helper() + + for _, expectedTaint := range expectedTaints { + found := false + for _, nodeTaint := range node.Spec.Taints { + if nodeTaint.Key == expectedTaint.Key && + nodeTaint.Value == expectedTaint.Value && + string(nodeTaint.Effect) == expectedTaint.Effect { + found = true + break + } + } + assert.True(t, found, "Expected taint %+v should be on node", expectedTaint) + } +} + +func verifyQuarantineLabels(t *testing.T, node *corev1.Node, expectedCordonReason string) { + t.Helper() + + assert.Equal(t, common.ServiceName, node.Labels["k8s.nvidia.com/cordon-by"], "cordon-by label should be set") + assert.Contains(t, node.Labels["k8s.nvidia.com/cordon-reason"], expectedCordonReason, "cordon-reason should contain expected value") + assert.NotEmpty(t, node.Labels["k8s.nvidia.com/cordon-timestamp"], "cordon-timestamp should be set") + assert.Equal(t, string(statemanager.QuarantinedLabelValue), node.Labels[statemanager.NVSentinelStateLabelKey], "nvsentinel-state should be quarantined") +} + +func verifyUnquarantineLabels(t *testing.T, node *corev1.Node) { + t.Helper() + + assert.Equal(t, common.ServiceName, node.Labels["k8s.nvidia.com/uncordon-by"], "uncordon-by label should be set") + assert.NotEmpty(t, node.Labels["k8s.nvidia.com/uncordon-timestamp"], "uncordon-timestamp should be set") + assert.NotContains(t, node.Labels, "k8s.nvidia.com/cordon-by", "cordon-by label should be removed") + assert.NotContains(t, node.Labels, "k8s.nvidia.com/cordon-reason", "cordon-reason label should be removed") + assert.NotContains(t, node.Labels, "k8s.nvidia.com/cordon-timestamp", "cordon-timestamp label should be removed") + assert.NotContains(t, node.Labels, statemanager.NVSentinelStateLabelKey, "nvsentinel-state label should be removed") +} + +func TestE2E_BasicQuarantineAndUnquarantine(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-basic-" 
+ primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Sending unhealthy event for initial quarantine") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Waiting for node to be quarantined") + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return node.Spec.Unschedulable && node.Annotations[common.QuarantineHealthEventAnnotationKey] != "" + }, eventuallyTimeout, eventuallyPollInterval, "Node should be quarantined") + + t.Log("Verify complete quarantine state with actual annotation content") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + + verifyHealthEventInAnnotation(t, node, "GpuXidError", "gpu-health-monitor", "GPU", "GPU", "0") + + t.Log("Verify applied taints annotation content") + expectedTaints := []config.Taint{ + {Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + } + verifyAppliedTaintsAnnotation(t, node, expectedTaints) + verifyNodeTaintsMatch(t, node, expectedTaints) + assert.Equal(t, "True", node.Annotations[quarantineHealthEventIsCordonedAnnotationKey], "Cordon annotation should be True") + verifyQuarantineLabels(t, node, "gpu-xid-critical-errors") + + t.Log("Sending healthy event for unquarantine") + eventID2 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID2, + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Waiting for UnQuarantined status") + require.Eventually(t, func() bool { + status := getStatus(eventID2) + return status != nil && *status == model.UnQuarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be UnQuarantined") + + t.Log("Waiting for node to be unquarantined") + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return !node.Spec.Unschedulable && node.Annotations[common.QuarantineHealthEventAnnotationKey] == "" + }, eventuallyTimeout, eventuallyPollInterval, "Node should be unquarantined") + + t.Log("Verify complete unquarantine state") + node, err = e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + fqTaintCount := 0 + for _, taint := range node.Spec.Taints { + if taint.Key == "nvidia.com/gpu-xid-error" { + fqTaintCount++ + } + } + assert.Equal(t, 0, fqTaintCount, "FQ taints should be removed") + assert.Empty(t, 
node.Annotations[quarantineHealthEventAnnotationKey], "Quarantine annotation should be removed") + assert.Empty(t, node.Annotations[quarantineHealthEventAppliedTaintsAnnotationKey], "Applied taints annotation should be removed") + assert.Empty(t, node.Annotations[quarantineHealthEventIsCordonedAnnotationKey], "Cordoned annotation should be removed") + verifyUnquarantineLabels(t, node) +} + +func TestE2E_EntityLevelTracking(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second) + defer cancel() + + nodeName := "e2e-entity-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("GPU 0 fails - initial quarantine") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is Quarantined for first failure") + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status != nil && *status == model.Quarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be Quarantined") + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Node should be quarantined") + + t.Log("GPU 1 fails - testing entity-level tracking") + eventID2 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID2, + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "1"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is AlreadyQuarantined for second failure") + require.Eventually(t, func() bool { + status := getStatus(eventID2) + return status != nil && *status == model.AlreadyQuarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be AlreadyQuarantined") + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return healthEventsMap.Count() == 2 + }, eventuallyTimeout, eventuallyPollInterval, "Should track 2 GPUs") + + t.Log("Verify actual annotation content for both entities") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + verifyHealthEventInAnnotation(t, node, "GpuXidError", "gpu-health-monitor", "GPU", "GPU", "0") + verifyHealthEventInAnnotation(t, node, "GpuXidError", "gpu-health-monitor", "GPU", "GPU", "1") + + t.Log("GPU 0 recovers - node should stay 
quarantined (GPU 1 still failing)") + eventID3 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID3, + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is AlreadyQuarantined (partial recovery, node stays quarantined)") + require.Eventually(t, func() bool { + status := getStatus(eventID3) + return status != nil && *status == model.AlreadyQuarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be AlreadyQuarantined for partial recovery") + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return node.Spec.Unschedulable && healthEventsMap.Count() == 1 + }, eventuallyTimeout, eventuallyPollInterval, "Should remove GPU 0, keep quarantined") + + t.Log("Verify GPU 1 is still in annotation, GPU 0 is not") + node, err = e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + verifyHealthEventInAnnotation(t, node, "GpuXidError", "gpu-health-monitor", "GPU", "GPU", "1") + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + err = json.Unmarshal([]byte(node.Annotations[quarantineHealthEventAnnotationKey]), &healthEventsMap) + require.NoError(t, err) + gpu0Query := &protos.HealthEvent{ + Agent: "gpu-health-monitor", + ComponentClass: "GPU", + CheckName: "GpuXidError", + NodeName: nodeName, + Version: 1, + EntitiesImpacted: []*protos.Entity{ + {EntityType: "GPU", EntityValue: "0"}, + }, + } + _, found := healthEventsMap.GetEvent(gpu0Query) + assert.False(t, found, "GPU 0 should NOT be in annotation after recovery") + + t.Log("GPU 1 recovers - node should be fully unquarantined") + eventID4 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID4, + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "1"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is UnQuarantined (complete recovery)") + require.Eventually(t, func() bool { + status := getStatus(eventID4) + return status != nil && *status == model.UnQuarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be UnQuarantined") + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return !node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Node should be unquarantined") +} + +func TestE2E_MultipleChecksOnSameNode(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second) + defer cancel() + + nodeName := "e2e-multicheck-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: 
"NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + { + Name: "gpu-nvlink-errors", + Version: "1", + Priority: 8, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuNvLinkWatch' && event.isHealthy == false"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-nvlink-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("XID Error on GPU 0") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval) + + t.Log("NVLink Error on GPU 1") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuNvLinkWatch", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "1"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return healthEventsMap.Count() == 2 && node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Should track both XID and NVLink entities") + + t.Log("Verify actual content for both checks/entities") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + verifyHealthEventInAnnotation(t, node, "GpuXidError", "gpu-health-monitor", "GPU", "GPU", "0") + verifyHealthEventInAnnotation(t, node, "GpuNvLinkWatch", "gpu-health-monitor", "GPU", "GPU", "1") + + t.Log("XID recovers - node stays quarantined (NVLink still failing)") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return healthEventsMap.Count() == 1 && node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "XID entity removed, NVLink remains, still quarantined") + + t.Log("NVLink recovers") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuNvLinkWatch", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "1"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return !node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Node should be unquarantined") +} + +func TestE2E_CheckLevelHealthyEvent(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second) + defer cancel() + + nodeName := "e2e-checklevel-" + 
primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Quarantine with multiple entities") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{ + {EntityType: "GPU", EntityValue: "0"}, + {EntityType: "GPU", EntityValue: "1"}, + }, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return node.Spec.Unschedulable && healthEventsMap.Count() == 2 + }, eventuallyTimeout, eventuallyPollInterval, "Should track 2 entities") + + t.Log("Check-level healthy event (empty entities) - should clear ALL entities for this check") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{}, // Empty - means all entities healthy + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return !node.Spec.Unschedulable && node.Annotations[common.QuarantineHealthEventAnnotationKey] == "" + }, eventuallyTimeout, eventuallyPollInterval, "Check-level healthy event should clear all entities and unquarantine") +} + +func TestE2E_DuplicateEntityEvents(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-duplicate-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("First failure on GPU 0") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return node.Spec.Unschedulable + }, 
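+ // Wait for the first failure to cordon the node before capturing the annotation
+ // baseline; the duplicate event sent below must leave that baseline unchanged.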
eventuallyTimeout, eventuallyPollInterval) + + // Get initial annotation before duplicate event + initialNode, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + initialAnnotation := initialNode.Annotations[common.QuarantineHealthEventAnnotationKey] + + t.Log("Duplicate failure on same GPU 0 - should not duplicate entity") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + // Use Never to verify annotation doesn't change for duplicate + assert.Never(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + currentAnnotation := node.Annotations[common.QuarantineHealthEventAnnotationKey] + return currentAnnotation != initialAnnotation + }, neverTimeout, neverPollInterval, "Duplicate entity should not change annotation") + + // Final verification + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + err = json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap) + require.NoError(t, err) + assert.Equal(t, 1, healthEventsMap.Count(), "Duplicate entity should not be added") +} + +func TestE2E_HealthyEventWithoutQuarantine(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-healthy-noq-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send healthy event without any prior quarantine") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is nil (healthy event without prior quarantine is skipped)") + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status == nil + }, statusCheckTimeout, statusCheckPollInterval, "Status should be nil for skipped event") + + t.Log("Verify node stays unquarantined") + assert.Never(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return node.Spec.Unschedulable + }, neverTimeout, neverPollInterval, "Node should not be quarantined") + + t.Log("Verify final state - no quarantine annotations") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + assert.Empty(t, node.Annotations[common.QuarantineHealthEventAnnotationKey]) +} + +func TestE2E_PartialEntityRecovery(t 
*testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second) + defer cancel() + + nodeName := "e2e-partial-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Fail GPUs 0, 1, 2 (send sequentially to avoid race conditions)") + for i := 0; i < 3; i++ { + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: fmt.Sprintf("%d", i)}}, + model.StatusInProgress, + ) + + // Wait for this GPU to be tracked before sending next event + expectedCount := i + 1 + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return healthEventsMap.Count() == expectedCount + }, statusCheckTimeout, statusCheckPollInterval, "Should track %d GPU(s)", expectedCount) + } + + t.Log("Recover GPU 1 only") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "1"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return healthEventsMap.Count() == 2 && node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Should remove GPU 1, keep node quarantined with GPU 0 and GPU 2") +} + +func TestE2E_AllGPUsFailThenRecover(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 40*time.Second) + defer cancel() + + nodeName := "e2e-allgpu-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + numGPUs := 8 + + t.Log("All GPUs fail (send sequentially to avoid 
race conditions)") + for i := 0; i < numGPUs; i++ { + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: fmt.Sprintf("%d", i)}}, + model.StatusInProgress, + ) + + // Wait for this GPU to be tracked before sending next event + expectedCount := i + 1 + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return healthEventsMap.Count() == expectedCount + }, statusCheckTimeout, statusCheckPollInterval, "Should track %d GPU(s)", expectedCount) + } + + t.Log("All GPUs recover") + for i := 0; i < numGPUs; i++ { + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: fmt.Sprintf("%d", i)}}, + model.StatusInProgress, + ) + } + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return !node.Spec.Unschedulable && node.Annotations[common.QuarantineHealthEventAnnotationKey] == "" + }, eventuallyTimeout, eventuallyPollInterval, "All GPUs recovered, node should be unquarantined") +} + +func TestE2E_SyslogMultipleEntityTypes(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second) + defer cancel() + + nodeName := "e2e-syslog-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "syslog-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'SysLogsXIDError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/syslog-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Syslog pattern: single event with multiple entity types (PCI + GPUID)") + mockWatcher.EventsChan <- bson.M{ + "operationType": "insert", + "fullDocument": bson.M{ + "_id": primitive.NewObjectID(), + "healtheventstatus": bson.M{ + "nodequarantined": model.StatusInProgress, + }, + "healthevent": bson.M{ + "nodename": nodeName, + "agent": "syslog-health-monitor", + "componentclass": "GPU", + "checkname": "SysLogsXIDError", + "version": uint32(1), + "ishealthy": false, + "isfatal": true, + "errorcode": []string{"79"}, + "entitiesimpacted": []interface{}{ + bson.M{"entitytype": "PCI", "entityvalue": "0000:b4:00"}, + bson.M{"entitytype": "GPUID", "entityvalue": "GPU-0b32a29e-0c94-cd1a-d44a-4e3ea8b2e3fc"}, + }, + }, + }, + } + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return node.Spec.Unschedulable && 
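+ // A single syslog event carries two distinct entity types (PCI and GPUID);
+ // both should show up as separate entries in the annotation map.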
healthEventsMap.Count() == 2 + }, eventuallyTimeout, eventuallyPollInterval, "Should track both PCI and GPUID entities") + + t.Log("Verify actual annotation content for both entity types") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + verifyHealthEventInAnnotation(t, node, "SysLogsXIDError", "syslog-health-monitor", "GPU", "PCI", "0000:b4:00") + verifyHealthEventInAnnotation(t, node, "SysLogsXIDError", "syslog-health-monitor", "GPU", "GPUID", "GPU-0b32a29e-0c94-cd1a-d44a-4e3ea8b2e3fc") + + t.Log("Check-level healthy event (empty entities) should clear BOTH PCI and GPUID") + mockWatcher.EventsChan <- bson.M{ + "operationType": "insert", + "fullDocument": bson.M{ + "_id": primitive.NewObjectID(), + "healtheventstatus": bson.M{ + "nodequarantined": model.StatusInProgress, + }, + "healthevent": bson.M{ + "nodename": nodeName, + "agent": "syslog-health-monitor", + "componentclass": "GPU", + "checkname": "SysLogsXIDError", + "version": uint32(1), + "ishealthy": true, + "message": "No Health Failures", + "entitiesimpacted": []interface{}{}, // Empty + }, + }, + } + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return !node.Spec.Unschedulable && node.Annotations[common.QuarantineHealthEventAnnotationKey] == "" + }, eventuallyTimeout, eventuallyPollInterval, "Check-level healthy event should clear all entity types") +} + +func TestE2E_ManualUncordon(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-manual-uncordon-" + primitive.NewObjectID().Hex()[:8] + + annotations := map[string]string{ + common.QuarantineHealthEventAnnotationKey: `[{"nodeName":"` + nodeName + `","agent":"test","checkName":"test","isHealthy":false,"entitiesImpacted":[{"entityType":"GPU","entityValue":"0"}]}]`, + common.QuarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"nvidia.com/gpu-error","Value":"true","Effect":"NoSchedule"}]`, + common.QuarantineHealthEventIsCordonedAnnotationKey: common.QuarantineHealthEventIsCordonedAnnotationValueTrue, + } + + labels := map[string]string{ + statemanager.NVSentinelStateLabelKey: string(statemanager.QuarantinedLabelValue), + } + + taints := []corev1.Taint{ + {Key: "nvidia.com/gpu-error", Value: "true", Effect: "NoSchedule"}, + } + + createE2ETestNode(ctx, t, nodeName, annotations, labels, taints, true) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + } + + // Setup reconciler to watch for manual uncordon events + // The node informer callbacks are registered during setup and will detect the manual uncordon + setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Manually uncordon the node") + quarantinedNode, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + quarantinedNode.Spec.Unschedulable = false + _, err = e2eTestClient.CoreV1().Nodes().Update(ctx, quarantinedNode, metav1.UpdateOptions{}) + require.NoError(t, err) + + t.Log("Verify manual uncordon is detected and FQ state cleaned up") + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + + if _, exists := node.Annotations[common.QuarantineHealthEventAnnotationKey]; exists { + return false + } + + if 
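+ // Besides clearing the FQ annotations and taints, the reconciler is expected to
+ // stamp the "uncordoned manually" marker annotation, which the check below waits for.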
node.Annotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey] != common.QuarantinedNodeUncordonedManuallyAnnotationValue { + return false + } + + fqTaintCount := 0 + for _, taint := range node.Spec.Taints { + if taint.Key == "nvidia.com/gpu-error" { + fqTaintCount++ + } + } + + return fqTaintCount == 0 + }, eventuallyTimeout, eventuallyPollInterval, "Manual uncordon should clean up FQ state") +} + +func TestE2E_BackwardCompatibilityOldFormat(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-backward-" + primitive.NewObjectID().Hex()[:8] + + // Old format: single HealthEvent object (not array) + existingOldEvent := &protos.HealthEvent{ + NodeName: nodeName, + Agent: "gpu-health-monitor", + ComponentClass: "GPU", + CheckName: "GpuXidError", + Version: 1, + IsHealthy: false, + IsFatal: true, + EntitiesImpacted: []*protos.Entity{ + {EntityType: "GPU", EntityValue: "0"}, + }, + } + + oldAnnotationBytes, err := json.Marshal(existingOldEvent) + require.NoError(t, err) + + annotations := map[string]string{ + common.QuarantineHealthEventAnnotationKey: string(oldAnnotationBytes), + common.QuarantineHealthEventIsCordonedAnnotationKey: "True", + common.QuarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"nvidia.com/gpu-xid-error","Value":"true","Effect":"NoSchedule"}]`, + } + + taints := []corev1.Taint{ + {Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + } + + createE2ETestNode(ctx, t, nodeName, annotations, nil, taints, true) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-nvlink-errors", + Version: "1", + Priority: 8, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuNvLinkWatch'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-nvlink-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Add new event for different check/entity") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuNvLinkWatch", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "1"}}, + model.StatusInProgress, + ) + + // Should convert to new format and append + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return healthEventsMap.Count() == 2 + }, eventuallyTimeout, eventuallyPollInterval, "Should convert old format and add new event") + + t.Log("Recover the old event") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return 
false + } + return healthEventsMap.Count() == 1 && node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Old event removed, new event remains") + + t.Log("Recover the new event") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuNvLinkWatch", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "1"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return !node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Node should be unquarantined") +} + +func TestE2E_MixedHealthyUnhealthyFlapping(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second) + defer cancel() + + nodeName := "e2e-flapping-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Flapping GPU scenario: alternating unhealthy and healthy events") + for cycle := 0; cycle < 3; cycle++ { + // Unhealthy + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return node.Spec.Unschedulable + }, statusCheckTimeout, statusCheckPollInterval, "Should be quarantined") + + // Healthy + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return !node.Spec.Unschedulable + }, statusCheckTimeout, statusCheckPollInterval, "Should be unquarantined") + } + + t.Log("Verify final state should be healthy") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + assert.False(t, node.Spec.Unschedulable) + assert.Empty(t, node.Annotations[common.QuarantineHealthEventAnnotationKey]) +} + +func TestE2E_MultipleNodesSimultaneous(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second) + defer cancel() + + nodeNames := []string{ + "e2e-multi-1-" + primitive.NewObjectID().Hex()[:6], + "e2e-multi-2-" + primitive.NewObjectID().Hex()[:6], + "e2e-multi-3-" + primitive.NewObjectID().Hex()[:6], + } + + for _, nodeName := range nodeNames { + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func(name string) { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, name, metav1.DeleteOptions{}) + }(nodeName) + } + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + 
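+ // LabelPrefix drives the cordon/uncordon label keys asserted elsewhere in this
+ // file (e.g. k8s.nvidia.com/cordon-by); the single rule set below is shared by
+ // all three nodes created for this test.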
RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send failure events for all nodes") + for _, nodeName := range nodeNames { + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + } + + // Verify all nodes are quarantined + for _, nodeName := range nodeNames { + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Node %s should be quarantined", nodeName) + } + + t.Log("Verify all have proper annotations and taints") + for _, nodeName := range nodeNames { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + assert.Contains(t, node.Annotations, common.QuarantineHealthEventAnnotationKey) + hasTaint := false + for _, taint := range node.Spec.Taints { + if taint.Key == "nvidia.com/gpu-xid-error" { + hasTaint = true + break + } + } + assert.True(t, hasTaint, "Node %s should have FQ taint", nodeName) + } +} + +func TestE2E_HealthyEventForNonMatchingCheck(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-nomatch-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Quarantine with XID error") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval) + + t.Log("Send healthy event for DIFFERENT check that was never failing") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuNvLinkWatch", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + // Node should remain quarantined (XID error still active, healthy NVLink event doesn't unquarantine) + assert.Never(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return 
false + } + return !node.Spec.Unschedulable + }, neverTimeout, neverPollInterval, "Node should remain quarantined") + + t.Log("Verify XID error still tracked") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + err = json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap) + require.NoError(t, err) + assert.Equal(t, 1, healthEventsMap.Count(), "Should still have XID error tracked") +} + +func TestE2E_MultipleRulesetsWithPriorities(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-priorities-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "low-priority-rule", + Version: "1", + Priority: 5, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-error", Value: "low", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: false}, + }, + { + Name: "high-priority-rule", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-error", Value: "high", Effect: "NoExecute"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "TestCheck", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + + // Should use higher priority effect (NoExecute) + for _, taint := range node.Spec.Taints { + if taint.Key == "nvidia.com/gpu-error" && taint.Value == "high" && string(taint.Effect) == "NoExecute" { + return node.Spec.Unschedulable + } + } + + return false + }, eventuallyTimeout, eventuallyPollInterval, "Should use higher priority taint effect") +} + +func TestE2E_NonFatalEventDoesNotQuarantine(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-nonfatal-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send non-fatal XID error (isFatal=false) - rule requires isFatal=true") + mockWatcher.EventsChan <- createHealthEventBSON( + 
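+ // Arguments: event ID, node name, check name, isHealthy=false, isFatal=false;
+ // the rule set requires isFatal == true, so this event should be ignored.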
primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + false, // Not fatal + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + // Verify node is never quarantined (rule doesn't match) + assert.Never(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return node.Spec.Unschedulable + }, neverTimeout, neverPollInterval, "Non-fatal event should not quarantine") + + t.Log("Verify no quarantine annotations") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + assert.Empty(t, node.Annotations[common.QuarantineHealthEventAnnotationKey]) +} + +func TestE2E_OutOfOrderEvents(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-outoforder-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send healthy event BEFORE unhealthy event (out of order)") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + // Verify node is never quarantined (healthy event without prior quarantine is skipped) + assert.Never(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return node.Spec.Unschedulable + }, neverTimeout, neverPollInterval, "Healthy event before unhealthy should not quarantine") + + t.Log("Now send unhealthy event") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Unhealthy event should quarantine") +} + +func TestE2E_SkipRedundantCordoning(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-redundant-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-critical-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: 
"true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("First check quarantines node") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Node should be quarantined") + + t.Log("Different check on already cordoned node - should skip redundant cordoning") + initialCordonState := true + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuMemWatch", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "1"}}, + model.StatusInProgress, + ) + + // Verify node remains cordoned (doesn't uncordon) + assert.Never(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return node.Spec.Unschedulable != initialCordonState + }, neverTimeout, neverPollInterval, "Node cordon state should not change") +} + +func TestE2E_NodeAlreadyCordonedManually(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-manual-cordon-" + primitive.NewObjectID().Hex()[:8] + + // Create node that's already manually cordoned (no FQ annotations) + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, true) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send unhealthy event - FQM should apply taints/annotations to manually cordoned node") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + // Verify FQM adds taints and annotations to manually cordoned node + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + + hasTaint := false + for _, taint := range node.Spec.Taints { + if taint.Key == "nvidia.com/gpu-xid-error" { + hasTaint = true + break + } + } + + return node.Spec.Unschedulable && + hasTaint && + node.Annotations[common.QuarantineHealthEventAnnotationKey] != "" + }, eventuallyTimeout, eventuallyPollInterval, "FQM should add taints/annotations to manually cordoned node") + + t.Log("Verify actual annotation content and taints") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + verifyHealthEventInAnnotation(t, node, "GpuXidError", "gpu-health-monitor", "GPU", "GPU", "0") + expectedTaints := []config.Taint{ + {Key: 
"nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + } + verifyAppliedTaintsAnnotation(t, node, expectedTaints) + verifyNodeTaintsMatch(t, node, expectedTaints) +} + +func TestE2E_NodeAlreadyQuarantinedStillUnhealthy(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-already-q-unhealthy-" + primitive.NewObjectID().Hex()[:8] + + // Create node already quarantined by FQM + existingEvent := &protos.HealthEvent{ + NodeName: nodeName, + Agent: "agent1", + CheckName: "checkA", + ComponentClass: "GPU", + Version: 1, + IsHealthy: false, + EntitiesImpacted: []*protos.Entity{ + {EntityType: "GPU", EntityValue: "0"}, + }, + } + + existingMap := healthEventsAnnotation.NewHealthEventsAnnotationMap() + existingMap.AddOrUpdateEvent(existingEvent) + existingBytes, err := json.Marshal(existingMap) + require.NoError(t, err) + + annotations := map[string]string{ + common.QuarantineHealthEventAnnotationKey: string(existingBytes), + common.QuarantineHealthEventIsCordonedAnnotationKey: "True", + } + + createE2ETestNode(ctx, t, nodeName, annotations, nil, nil, true) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send another unhealthy event for same entity - should remain quarantined") + mockWatcher.EventsChan <- bson.M{ + "operationType": "insert", + "fullDocument": bson.M{ + "_id": primitive.NewObjectID(), + "healtheventstatus": bson.M{ + "nodequarantined": model.StatusInProgress, + }, + "healthevent": bson.M{ + "nodename": nodeName, + "agent": "agent1", + "componentclass": "GPU", + "checkname": "checkA", + "version": uint32(1), + "ishealthy": false, + "entitiesimpacted": []interface{}{ + bson.M{"entitytype": "GPU", "entityvalue": "0"}, + }, + }, + }, + } + + // Verify node never unquarantines (remains quarantined with same entity) + assert.Never(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return !node.Spec.Unschedulable + }, neverTimeout, neverPollInterval, "Node should remain quarantined") + + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + assert.NotEmpty(t, node.Annotations[common.QuarantineHealthEventAnnotationKey]) +} + +func TestE2E_NodeAlreadyQuarantinedBecomesHealthy(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-already-q-healthy-" + primitive.NewObjectID().Hex()[:8] + + // Create node already quarantined by FQM + existingEvent := &protos.HealthEvent{ + NodeName: nodeName, + Agent: "agent1", + CheckName: "checkA", + ComponentClass: "GPU", + Version: 1, + IsHealthy: false, + EntitiesImpacted: []*protos.Entity{ + {EntityType: "GPU", EntityValue: "0"}, + }, + } + + existingMap := healthEventsAnnotation.NewHealthEventsAnnotationMap() + existingMap.AddOrUpdateEvent(existingEvent) + existingBytes, err := json.Marshal(existingMap) + require.NoError(t, err) + + annotations := map[string]string{ + common.QuarantineHealthEventAnnotationKey: string(existingBytes), + common.QuarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"nvidia.com/gpu-error","Value":"true","Effect":"NoSchedule"}]`, + common.QuarantineHealthEventIsCordonedAnnotationKey: "True", + } + + taints := 
[]corev1.Taint{ + {Key: "nvidia.com/gpu-error", Value: "true", Effect: "NoSchedule"}, + } + + createE2ETestNode(ctx, t, nodeName, annotations, nil, taints, true) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send healthy event - should unquarantine") + mockWatcher.EventsChan <- bson.M{ + "operationType": "insert", + "fullDocument": bson.M{ + "_id": primitive.NewObjectID(), + "healtheventstatus": bson.M{ + "nodequarantined": model.StatusInProgress, + }, + "healthevent": bson.M{ + "nodename": nodeName, + "agent": "agent1", + "componentclass": "GPU", + "checkname": "checkA", + "version": uint32(1), + "ishealthy": true, + "entitiesimpacted": []interface{}{ + bson.M{"entitytype": "GPU", "entityvalue": "0"}, + }, + }, + }, + } + + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + + fqTaintCount := 0 + for _, taint := range node.Spec.Taints { + if taint.Key == "nvidia.com/gpu-error" { + fqTaintCount++ + } + } + + return !node.Spec.Unschedulable && + node.Annotations[common.QuarantineHealthEventAnnotationKey] == "" && + fqTaintCount == 0 + }, eventuallyTimeout, eventuallyPollInterval, "Node should be unquarantined") + + t.Log("Verify all FQ annotations removed") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + assert.Empty(t, node.Annotations[common.QuarantineHealthEventAnnotationKey], "Quarantine annotation should be removed") + assert.Empty(t, node.Annotations[common.QuarantineHealthEventAppliedTaintsAnnotationKey], "Applied taints annotation should be removed") + assert.Empty(t, node.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey], "Cordoned annotation should be removed") +} + +func TestE2E_RulesetNotMatching(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-nomatch-rule-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-fatal-only", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send event that doesn't match (wrong checkName)") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuMemWatch", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + // Verify node never gets quarantined (rule doesn't match) + assert.Never(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return node.Spec.Unschedulable + }, neverTimeout, neverPollInterval, "Node should not be quarantined when rule doesn't match") + + 
t.Log("Send event that partially matches (correct checkName but not fatal)") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + false, // Not fatal + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + // Verify node never gets quarantined (isFatal requirement not met) + assert.Never(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return node.Spec.Unschedulable + }, neverTimeout, neverPollInterval, "Node should not be quarantined when isFatal requirement not met") +} + +func TestE2E_PartialAnnotationUpdate(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second) + defer cancel() + + nodeName := "e2e-partial-ann-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, _, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Quarantine with GPU 0, 1, 2 (send sequentially to avoid race conditions)") + for i := 0; i < 3; i++ { + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: fmt.Sprintf("%d", i)}}, + model.StatusInProgress, + ) + + // Wait for this GPU to be tracked before sending next event + expectedCount := i + 1 + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + if err := json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap); err != nil { + return false + } + return healthEventsMap.Count() == expectedCount + }, statusCheckTimeout, statusCheckPollInterval, "Should track %d GPU(s)", expectedCount) + } + + initialAnnotation := "" + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + initialAnnotation = node.Annotations[common.QuarantineHealthEventAnnotationKey] + + t.Log("Partial recovery of GPU 1 - annotation should be updated") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + nodeName, + "GpuXidError", + true, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "1"}}, + model.StatusInProgress, + ) + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + currentAnnotation := node.Annotations[common.QuarantineHealthEventAnnotationKey] + return currentAnnotation != initialAnnotation + }, statusCheckTimeout, statusCheckPollInterval, "Annotation should be updated for partial recovery") + + t.Log("Verify annotation content changed correctly - GPU 1 removed, GPU 0 and 2 remain") + node, err = e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + + 
var healthEventsMap healthEventsAnnotation.HealthEventsAnnotationMap + err = json.Unmarshal([]byte(node.Annotations[common.QuarantineHealthEventAnnotationKey]), &healthEventsMap) + require.NoError(t, err) + assert.Equal(t, 2, healthEventsMap.Count(), "Should have 2 entities remaining (GPU 0 and 2)") + assert.True(t, node.Spec.Unschedulable, "Node should remain quarantined") + + verifyHealthEventInAnnotation(t, node, "GpuXidError", "gpu-health-monitor", "GPU", "GPU", "0") + verifyHealthEventInAnnotation(t, node, "GpuXidError", "gpu-health-monitor", "GPU", "GPU", "2") + gpu1Query := &protos.HealthEvent{ + Agent: "gpu-health-monitor", + ComponentClass: "GPU", + CheckName: "GpuXidError", + NodeName: nodeName, + Version: 1, + EntitiesImpacted: []*protos.Entity{ + {EntityType: "GPU", EntityValue: "1"}, + }, + } + _, found := healthEventsMap.GetEvent(gpu1Query) + assert.False(t, found, "GPU 1 should NOT be in annotation after partial recovery") +} + +func TestE2E_CircuitBreakerBasic(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second) + defer cancel() + + // Create 10 test nodes + baseNodeName := "e2e-cb-basic-" + primitive.NewObjectID().Hex()[:6] + for i := 0; i < 10; i++ { + nodeName := fmt.Sprintf("%s-%d", baseNodeName, i) + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func(name string) { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, name, metav1.DeleteOptions{}) + }(nodeName) + } + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + // Setup with circuit breaker enabled + r, mockWatcher, _, cb := setupE2EReconciler(t, ctx, tomlConfig, &breaker.CircuitBreakerConfig{ + Namespace: "default", + Percentage: 50, + Duration: 5 * time.Minute, + }) + + // Verify circuit breaker is initialized + t.Log("Verify circuit breaker is initialized") + require.NotNil(t, cb, "Circuit breaker should be initialized") + + // BLOCKING: Wait for all 10 nodes to be visible in NodeInformer cache + // This is critical for circuit breaker percentage calculations to be accurate + // Test will fail if nodes aren't visible within 5 seconds + require.Eventually(t, func() bool { + totalNodes, _, err := r.k8sClient.NodeInformer.GetNodeCounts() + return err == nil && totalNodes == 10 + }, statusCheckTimeout, statusCheckPollInterval, "NodeInformer should see all 10 nodes") + + t.Log("Cordoning 4 nodes (40%) - should not trip circuit breaker") + for i := 0; i < 4; i++ { + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + fmt.Sprintf("%s-%d", baseNodeName, i), + "TestCheck", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + } + + // Wait for all 4 nodes to be cordoned + require.Eventually(t, func() bool { + cordonedCount := 0 + for i := 0; i < 4; i++ { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, fmt.Sprintf("%s-%d", baseNodeName, i), metav1.GetOptions{}) + if err == nil && node.Spec.Unschedulable { + cordonedCount++ + } + } + return cordonedCount == 4 + }, statusCheckTimeout, statusCheckPollInterval, "4 nodes should be cordoned") + + isTripped, err := cb.IsTripped(ctx) + require.NoError(t, err) + assert.False(t, isTripped, 
"Circuit breaker should not trip at 40%") + + t.Log("Cordoning 5th node (50%) - should trip circuit breaker") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + fmt.Sprintf("%s-4", baseNodeName), + "TestCheck", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + // Wait for 5th node to be cordoned + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, fmt.Sprintf("%s-4", baseNodeName), metav1.GetOptions{}) + return err == nil && node.Spec.Unschedulable + }, statusCheckTimeout, statusCheckPollInterval, "5th node should be cordoned") + + isTripped, err = cb.IsTripped(ctx) + require.NoError(t, err) + assert.True(t, isTripped, "Circuit breaker should trip at 50%") + + t.Log("Trying 6th node - should be blocked by circuit breaker") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + fmt.Sprintf("%s-5", baseNodeName), + "TestCheck", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + // Verify 6th node never gets cordoned (circuit breaker blocks it) + assert.Never(t, func() bool { + sixthNode, err := e2eTestClient.CoreV1().Nodes().Get(ctx, fmt.Sprintf("%s-5", baseNodeName), metav1.GetOptions{}) + if err != nil { + return false + } + return sixthNode.Spec.Unschedulable + }, statusCheckTimeout, statusCheckPollInterval, "6th node should not be cordoned due to circuit breaker") +} + +func TestE2E_CircuitBreakerSlidingWindow(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + // Create 10 test nodes + baseNodeName := "e2e-cb-window-" + primitive.NewObjectID().Hex()[:6] + for i := 0; i < 10; i++ { + nodeName := fmt.Sprintf("%s-%d", baseNodeName, i) + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func(name string) { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, name, metav1.DeleteOptions{}) + }(nodeName) + } + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + // Setup with circuit breaker (short window for testing) + r, mockWatcher, _, cb := setupE2EReconciler(t, ctx, tomlConfig, &breaker.CircuitBreakerConfig{ + Namespace: "default", + Percentage: 50, + Duration: 2 * time.Second, // Short window for testing + }) + + t.Log("Verify circuit breaker is initialized") + require.NotNil(t, cb, "Circuit breaker should be initialized") + + // BLOCKING: Wait for all 10 nodes to be visible in NodeInformer cache + // This is critical for circuit breaker percentage calculations to be accurate + // Test will fail if nodes aren't visible within 5 seconds + require.Eventually(t, func() bool { + totalNodes, _, err := r.k8sClient.NodeInformer.GetNodeCounts() + return err == nil && totalNodes == 10 + }, statusCheckTimeout, statusCheckPollInterval, "NodeInformer should see all 10 nodes") + + t.Log("Cordoning 5 nodes to trip the circuit breaker") + for i := 0; i < 5; i++ { + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + fmt.Sprintf("%s-%d", baseNodeName, i), + "TestCheck", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + 
model.StatusInProgress, + ) + } + + // Wait for all 5 nodes to be cordoned + require.Eventually(t, func() bool { + cordonedCount := 0 + for i := 0; i < 5; i++ { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, fmt.Sprintf("%s-%d", baseNodeName, i), metav1.GetOptions{}) + if err == nil && node.Spec.Unschedulable { + cordonedCount++ + } + } + return cordonedCount == 5 + }, statusCheckTimeout, statusCheckPollInterval, "5 nodes should be cordoned") + + isTripped, err := cb.IsTripped(ctx) + require.NoError(t, err) + assert.True(t, isTripped, "Circuit breaker should trip") + + t.Log("Forcing circuit breaker to CLOSED and waiting for window to expire") + err = cb.ForceState(ctx, "CLOSED") + require.NoError(t, err) + + // Wait for sliding window to fully expire (2 second window + buffer) + time.Sleep(3 * time.Second) + + // Now check - should not trip since window has expired + isTripped, err = cb.IsTripped(ctx) + require.NoError(t, err) + assert.False(t, isTripped, "Circuit breaker should not be tripped after sliding window expires") +} + +func TestE2E_CircuitBreakerUniqueNodeTracking(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second) + defer cancel() + + // Create 10 test nodes + baseNodeName := "e2e-cb-unique-" + primitive.NewObjectID().Hex()[:6] + for i := 0; i < 10; i++ { + nodeName := fmt.Sprintf("%s-%d", baseNodeName, i) + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func(name string) { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, name, metav1.DeleteOptions{}) + }(nodeName) + } + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "true"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + // Setup with circuit breaker enabled + r, mockWatcher, _, cb := setupE2EReconciler(t, ctx, tomlConfig, &breaker.CircuitBreakerConfig{ + Namespace: "default", + Percentage: 50, + Duration: 5 * time.Minute, + }) + + t.Log("Verify circuit breaker is initialized") + require.NotNil(t, cb, "Circuit breaker should be initialized") + + t.Log("Waiting for all nodes to be visible in NodeInformer cache") + require.Eventually(t, func() bool { + totalNodes, _, err := r.k8sClient.NodeInformer.GetNodeCounts() + return err == nil && totalNodes == 10 + }, statusCheckTimeout, statusCheckPollInterval, "NodeInformer should see all 10 nodes") + + t.Log("Sending first event for node 0 to test unique node tracking") + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + fmt.Sprintf("%s-0", baseNodeName), + "TestCheck", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + // Wait for node 0 to be cordoned + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, fmt.Sprintf("%s-0", baseNodeName), metav1.GetOptions{}) + return err == nil && node.Spec.Unschedulable + }, statusCheckTimeout, statusCheckPollInterval, "Node 0 should be cordoned") + + t.Log("Sending 9 duplicate events for same node (testing deduplication)") + for i := 1; i < 10; i++ { + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + fmt.Sprintf("%s-0", baseNodeName), + "TestCheck", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + 
model.StatusInProgress, + ) + } + + isTripped, err := cb.IsTripped(ctx) + require.NoError(t, err) + assert.False(t, isTripped, "Circuit breaker should not trip with only 1 unique node") + + t.Log("Adding 4 more unique nodes to reach 5 total (50% threshold)") + for i := 1; i <= 4; i++ { + mockWatcher.EventsChan <- createHealthEventBSON( + primitive.NewObjectID(), + fmt.Sprintf("%s-%d", baseNodeName, i), + "TestCheck", + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + } + + // Wait for all 5 nodes to be cordoned + require.Eventually(t, func() bool { + cordonedCount := 0 + for i := 0; i < 5; i++ { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, fmt.Sprintf("%s-%d", baseNodeName, i), metav1.GetOptions{}) + if err == nil && node.Spec.Unschedulable { + cordonedCount++ + } + } + return cordonedCount == 5 + }, statusCheckTimeout, statusCheckPollInterval, "5 nodes should be cordoned") + + isTripped, err = cb.IsTripped(ctx) + require.NoError(t, err) + assert.True(t, isTripped, "Circuit breaker should trip with 5 unique nodes (50%)") +} + +func TestE2E_QuarantineOverridesForce(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-force-quarantine-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "should-not-match", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "false"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/test", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send event with QuarantineOverrides.Force=true (bypasses rule evaluation)") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- bson.M{ + "operationType": "insert", + "fullDocument": bson.M{ + "_id": eventID1, + "healtheventstatus": bson.M{ + "nodequarantined": model.StatusInProgress, + }, + "healthevent": bson.M{ + "nodename": nodeName, + "agent": "test-agent", + "componentclass": "GPU", + "checkname": "TestCheck", + "version": uint32(1), + "ishealthy": false, + "message": "Force quarantine for maintenance", + "metadata": bson.M{ + "creator_id": "user123", + }, + "quarantineoverrides": bson.M{ + "force": true, + }, + }, + }, + } + + // Verify status is Quarantined (even though rule doesn't match) + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status != nil && *status == model.Quarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be Quarantined with force override") + + t.Log("Verify node is cordoned with special labels") + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return node.Spec.Unschedulable && + node.Labels["k8s.nvidia.com/cordon-by"] == "test-agent-user123" && + node.Labels["k8s.nvidia.com/cordon-reason"] == "Force-quarantine-for-maintenance" + }, eventuallyTimeout, eventuallyPollInterval, "Node should be force quarantined with special labels") +} + +func TestE2E_NodeRuleEvaluator(t *testing.T) { + ctx, cancel := 
context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-node-rule-" + primitive.NewObjectID().Hex()[:8] + + // Create node with specific label + labels := map[string]string{ + "k8saas.nvidia.com/ManagedByNVSentinel": "true", + } + + createE2ETestNode(ctx, t, nodeName, nil, labels, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "managed-nodes-only", + Version: "1", + Priority: 10, + Match: config.Match{ + All: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + {Kind: "Node", Expression: "node.metadata.labels['k8saas.nvidia.com/ManagedByNVSentinel'] == 'true'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send event - should match both HealthEvent and Node rules") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is Quarantined (Node rule matched)") + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status != nil && *status == model.Quarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be Quarantined when Node rule matches") + + require.Eventually(t, func() bool { + node, _ := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + return node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Node should be quarantined when Node rule matches") +} + +func TestE2E_NodeRuleDoesNotMatch(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-node-nomatch-" + primitive.NewObjectID().Hex()[:8] + + // Create node WITHOUT the required label + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "managed-nodes-only", + Version: "1", + Priority: 10, + Match: config.Match{ + All: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + {Kind: "Node", Expression: "node.metadata.labels['k8saas.nvidia.com/ManagedByNVSentinel'] == 'true'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send event - Node rule should NOT match (label missing)") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is nil (rule didn't match)") + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status == nil + }, statusCheckTimeout, statusCheckPollInterval, "Status should be nil when Node rule doesn't match") + + t.Log("Verify 
node is NOT quarantined") + assert.Never(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + return node.Spec.Unschedulable + }, neverTimeout, neverPollInterval, "Node should not be quarantined when Node rule doesn't match") +} + +func TestE2E_TaintWithoutCordon(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-taint-no-cordon-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "taint-only-rule", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: false}, // No cordon + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Sending taint-only event (no cordon)") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Waiting for Quarantined status") + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status != nil && *status == model.Quarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be Quarantined") + + t.Log("Verify node is tainted but NOT cordoned") + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + + hasTaint := false + for _, taint := range node.Spec.Taints { + if taint.Key == "nvidia.com/gpu-xid-error" { + hasTaint = true + break + } + } + + return hasTaint && !node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Node should be tainted but not cordoned") + + t.Log("Verify quarantine annotation exists but NOT cordon annotation") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + assert.NotEmpty(t, node.Annotations[common.QuarantineHealthEventAnnotationKey]) + assert.Empty(t, node.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey], "Cordon annotation should not exist") +} + +func TestE2E_CordonWithoutTaint(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-cordon-no-taint-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "cordon-only-rule", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + }, + }, + Taint: config.Taint{}, // No taint + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Sending cordon-only event 
(no taint)") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is Quarantined") + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status != nil && *status == model.Quarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be Quarantined") + + t.Log("Verify node is cordoned but has NO FQ taints") + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + + return node.Spec.Unschedulable + }, eventuallyTimeout, eventuallyPollInterval, "Node should be cordoned") + + t.Log("Verify no FQ taints (cordon-only)") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + fqTaintCount := 0 + for _, taint := range node.Spec.Taints { + if taint.Key == "nvidia.com/test" { + fqTaintCount++ + } + } + assert.Equal(t, 0, fqTaintCount, "Should have no FQ taints") + assert.NotEmpty(t, node.Annotations[common.QuarantineHealthEventAnnotationKey]) + assert.Equal(t, "True", node.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey]) + assert.Empty(t, node.Annotations[common.QuarantineHealthEventAppliedTaintsAnnotationKey], "Applied taints annotation should be empty") +} + +func TestE2E_ManualUncordonAnnotationCleanup(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-manual-cleanup-" + primitive.NewObjectID().Hex()[:8] + + // Create node with manual uncordon annotation (from previous manual uncordon) + annotations := map[string]string{ + common.QuarantinedNodeUncordonedManuallyAnnotationKey: common.QuarantinedNodeUncordonedManuallyAnnotationValue, + } + + createE2ETestNode(ctx, t, nodeName, annotations, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send unhealthy event - should remove manual uncordon annotation and quarantine") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is Quarantined") + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status != nil && *status == model.Quarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be Quarantined") + + t.Log("Verify manual uncordon annotation is removed and FQ annotations added") + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + + return node.Spec.Unschedulable && + 
node.Annotations[common.QuarantineHealthEventAnnotationKey] != "" && + node.Annotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey] == "" + }, eventuallyTimeout, eventuallyPollInterval, "Manual uncordon annotation should be removed, FQ annotations added") +} + +func TestE2E_UnhealthyEventOnQuarantinedNodeNoRuleMatch(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-q-node-nomatch-" + primitive.NewObjectID().Hex()[:8] + + // Create node already quarantined + existingEvent := &protos.HealthEvent{ + NodeName: nodeName, + Agent: "gpu-health-monitor", + CheckName: "GpuXidError", + ComponentClass: "GPU", + Version: 1, + IsHealthy: false, + EntitiesImpacted: []*protos.Entity{ + {EntityType: "GPU", EntityValue: "0"}, + }, + } + + existingMap := healthEventsAnnotation.NewHealthEventsAnnotationMap() + existingMap.AddOrUpdateEvent(existingEvent) + existingBytes, err := json.Marshal(existingMap) + require.NoError(t, err) + + annotations := map[string]string{ + common.QuarantineHealthEventAnnotationKey: string(existingBytes), + common.QuarantineHealthEventIsCordonedAnnotationKey: "True", + } + + createE2ETestNode(ctx, t, nodeName, annotations, nil, nil, true) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-only", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + initialAnnotation := string(existingBytes) + + t.Log("Send unhealthy event for different check that doesn't match any rules") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuMemWatch", // Different check - doesn't match rule + false, + false, + []*protos.Entity{{EntityType: "GPU", EntityValue: "1"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is AlreadyQuarantined (node stays quarantined but event doesn't match rules)") + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status != nil && *status == model.AlreadyQuarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be AlreadyQuarantined") + + t.Log("Verify annotation is NOT updated (event doesn't match rules, so not added)") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + assert.Equal(t, initialAnnotation, node.Annotations[common.QuarantineHealthEventAnnotationKey], "Annotation should not change for non-matching rule") +} + +func TestE2E_DryRunMode(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-dryrun-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "gpu-xid-errors", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: 
"HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + // Setup with DryRun=true (circuit breaker disabled) + _, mockWatcher, getStatus, _ := setupE2EReconcilerWithOptions(t, ctx, E2EReconcilerConfig{ + TomlConfig: tomlConfig, + CircuitBreakerConfig: nil, + DryRun: true, + }) + + t.Log("Sending event in dry-run mode") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is Quarantined (dry run still returns status)") + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status != nil && *status == model.Quarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be Quarantined in dry run") + + t.Log("Verify node is NOT actually cordoned or tainted (dry run)") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + assert.False(t, node.Spec.Unschedulable, "Node should NOT be cordoned in dry run mode") + + // Annotations ARE added in dry run (only spec changes are skipped) + assert.NotEmpty(t, node.Annotations[common.QuarantineHealthEventAnnotationKey], "Annotations are still added in dry run") +} + +func TestE2E_TaintOnlyThenCordonRule(t *testing.T) { + ctx, cancel := context.WithTimeout(e2eTestContext, 20*time.Second) + defer cancel() + + nodeName := "e2e-taint-then-cordon-" + primitive.NewObjectID().Hex()[:8] + createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false) + defer func() { + _ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) + }() + + tomlConfig := config.TomlConfig{ + LabelPrefix: "k8s.nvidia.com/", + RuleSets: []config.RuleSet{ + { + Name: "taint-first", + Version: "1", + Priority: 5, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"}, + }, + }, + Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"}, + Cordon: config.Cordon{ShouldCordon: false}, + }, + { + Name: "cordon-second", + Version: "1", + Priority: 10, + Match: config.Match{ + Any: []config.Rule{ + {Kind: "HealthEvent", Expression: "event.isFatal == true"}, + }, + }, + Taint: config.Taint{}, + Cordon: config.Cordon{ShouldCordon: true}, + }, + }, + } + + _, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil) + + t.Log("Send fatal XID error - both rules match (taint + cordon)") + eventID1 := primitive.NewObjectID() + mockWatcher.EventsChan <- createHealthEventBSON( + eventID1, + nodeName, + "GpuXidError", + false, + true, + []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, + model.StatusInProgress, + ) + + t.Log("Verify status is Quarantined") + require.Eventually(t, func() bool { + status := getStatus(eventID1) + return status != nil && *status == model.Quarantined + }, statusCheckTimeout, statusCheckPollInterval, "Status should be Quarantined") + + t.Log("Verify node has BOTH taint AND cordon") + require.Eventually(t, func() bool { + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false + } + + hasTaint := false + for _, taint := range node.Spec.Taints { + if taint.Key == "nvidia.com/gpu-xid-error" { + hasTaint = true + 
break + } + } + + return node.Spec.Unschedulable && hasTaint + }, eventuallyTimeout, eventuallyPollInterval, "Node should have both taint and cordon") + + t.Log("Verify both annotations exist") + node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + require.NoError(t, err) + assert.NotEmpty(t, node.Annotations[common.QuarantineHealthEventAppliedTaintsAnnotationKey], "Applied taints annotation should exist") + assert.Equal(t, "True", node.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey], "Cordon annotation should exist") +} diff --git a/fault-quarantine-module/pkg/reconciler/reconciler_test.go b/fault-quarantine-module/pkg/reconciler/reconciler_test.go deleted file mode 100644 index eb8d55971..000000000 --- a/fault-quarantine-module/pkg/reconciler/reconciler_test.go +++ /dev/null @@ -1,3326 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package reconciler - -import ( - "context" - "encoding/json" - "fmt" - "reflect" - "slices" - "strings" - "sync" - "testing" - "time" - - "github.com/nvidia/nvsentinel/commons/pkg/statemanager" - "github.com/nvidia/nvsentinel/data-models/pkg/model" - "github.com/nvidia/nvsentinel/data-models/pkg/protos" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/common" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/config" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/evaluator" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/healthEventsAnnotation" - "github.com/nvidia/nvsentinel/fault-quarantine-module/pkg/informer" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/fake" - k8stesting "k8s.io/client-go/testing" - "k8s.io/client-go/tools/cache" -) - -var ( - quarantineHealthEventAnnotationKey = common.QuarantineHealthEventAnnotationKey - quarantineHealthEventAppliedTaintsAnnotationKey = common.QuarantineHealthEventAppliedTaintsAnnotationKey - quarantineHealthEventIsCordonedAnnotationKey = common.QuarantineHealthEventIsCordonedAnnotationKey -) - -type mockK8sClient struct { - getNodeAnnotationsFn func(ctx context.Context, nodeName string) (map[string]string, error) - getNodesWithAnnotationFn func(ctx context.Context, annotationKey string) ([]string, error) - taintAndCordonNodeFn func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error - unTaintAndUnCordonNodeFn func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error - updateNodeAnnotationsFn func(ctx context.Context, nodeName string, annotations map[string]string) error - getK8sClientFn func() kubernetes.Interface - ensureConfigMapFn func(ctx context.Context, name, namespace string, initialStatus string) error - 
readCBStateFn func(ctx context.Context, name, namespace string) (string, error) - writeCBStateFn func(ctx context.Context, name, namespace, status string) error - getTotalGpuNodesFn func(ctx context.Context) (int, error) -} - -func (m *mockK8sClient) GetNodeAnnotations(ctx context.Context, nodeName string) (map[string]string, error) { - return m.getNodeAnnotationsFn(ctx, nodeName) -} -func (m *mockK8sClient) GetNodesWithAnnotation(ctx context.Context, annotationKey string) ([]string, error) { - return m.getNodesWithAnnotationFn(ctx, annotationKey) -} -func (m *mockK8sClient) TaintAndCordonNodeAndSetAnnotations(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelsMap map[string]string) error { - return m.taintAndCordonNodeFn(ctx, nodeName, taints, isCordon, annotations, labelsMap) -} -func (m *mockK8sClient) UnTaintAndUnCordonNodeAndRemoveAnnotations(ctx context.Context, nodeName string, taints []config.Taint, isUnCordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - return m.unTaintAndUnCordonNodeFn(ctx, nodeName, taints, isUnCordon, annotationKeys, labelsToRemove, labelMap) -} - -func (m *mockK8sClient) UpdateNodeAnnotations(ctx context.Context, nodeName string, annotations map[string]string) error { - return m.updateNodeAnnotationsFn(ctx, nodeName, annotations) -} - -func (m *mockK8sClient) GetK8sClient() kubernetes.Interface { - return m.getK8sClientFn() -} - -func (m *mockK8sClient) EnsureCircuitBreakerConfigMap(ctx context.Context, name, namespace string, initialStatus string) error { - if m.ensureConfigMapFn != nil { - return m.ensureConfigMapFn(ctx, name, namespace, initialStatus) - } - return nil -} - -func (m *mockK8sClient) ReadCircuitBreakerState(ctx context.Context, name, namespace string) (string, error) { - if m.readCBStateFn != nil { - return m.readCBStateFn(ctx, name, namespace) - } - return "", nil -} - -func (m *mockK8sClient) WriteCircuitBreakerState(ctx context.Context, name, namespace, status string) error { - if m.writeCBStateFn != nil { - return m.writeCBStateFn(ctx, name, namespace, status) - } - return nil -} - -func (m *mockK8sClient) GetTotalGpuNodes(ctx context.Context) (int, error) { - if m.getTotalGpuNodesFn != nil { - return m.getTotalGpuNodesFn(ctx) - } - return 10, nil // Default value for tests -} - -type mockEvaluator struct { - name string - ok bool - evalErr error - priority int - version string - ruleEvalResult common.RuleEvaluationResult -} - -func (m *mockEvaluator) GetName() string { - return m.name -} - -func (m *mockEvaluator) Evaluate(event *protos.HealthEvent) (common.RuleEvaluationResult, error) { - return m.ruleEvalResult, m.evalErr -} - -func (m *mockEvaluator) GetPriority() int { - return m.priority -} - -func (m *mockEvaluator) GetVersion() string { - return m.version -} - -func TestHandleEvent(t *testing.T) { - ctx := context.Background() - - tomlConfig := config.TomlConfig{ - LabelPrefix: "k88s.nvidia.com/", - RuleSets: []config.RuleSet{ - { - Name: "ruleset1", - Taint: config.Taint{ - Key: "key1", - Value: "val1", - Effect: "NoSchedule", - }, - Cordon: config.Cordon{ShouldCordon: false}, - Priority: 10, - }, - { - Name: "ruleset2", - Taint: config.Taint{ - Key: "key2", - Value: "val2", - Effect: "NoExecute", - }, - Cordon: config.Cordon{ShouldCordon: true}, - Priority: 5, - }, - }, - } - - circuitBreakerConfig := CircuitBreakerConfig{ - Namespace: "nvsentinel", - Name: "fault-quarantine-circuit-breaker", - Percentage: 50, - 
Duration: 5 * time.Minute, - } - - cfg := ReconcilerConfig{ - TomlConfig: tomlConfig, - CircuitBreaker: circuitBreakerConfig, - K8sClient: &mockK8sClient{ - getNodesWithAnnotationFn: func(ctx context.Context, annotationKey string) ([]string, error) { - // Initially no quarantined nodes - return []string{}, nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelsMap map[string]string) error { - // ensure it is called with correct parameters - if nodeName != "node1" { - t.Errorf("Expected node name node1, got %s", nodeName) - } - // We know from these rules one taint and cordon should happen - if len(taints) != 2 { - t.Errorf("Expected 2 taints to be applied, got %d", len(taints)) - } - if !isCordon { - t.Errorf("Expected node to be cordoned") - } - if _, ok := annotations[common.QuarantineHealthEventAnnotationKey]; !ok { - t.Errorf("Expected quarantineHealthEvent annotation to be set") - } - if len(labelsMap) != 4 { - t.Errorf("Expected cordon labels to be applied on node %s", nodeName) - } - return nil - }, - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return map[string]string{}, nil - }, - }, - } - - r := NewReconciler(ctx, cfg, nil) - r.SetLabelKeys(cfg.TomlConfig.LabelPrefix) - - ruleSetEvals := []evaluator.RuleSetEvaluatorIface{ - &mockEvaluator{name: "ruleset1", ok: true}, // applies taint key1=val1 - &mockEvaluator{name: "ruleset2", ok: true}, // applies taint key2=val2 and cordon - } - - event := &protos.HealthEvent{ - NodeName: "node1", - } - - // Create a wrapper around the health event - healthEventWithStatus := &model.HealthEventWithStatus{ - HealthEvent: event, - } - - isQuarantined, ruleEvalResult := r.handleEvent(ctx, healthEventWithStatus, ruleSetEvals, - rulesetsConfig{ - TaintConfigMap: map[string]*config.Taint{ - "ruleset1": &tomlConfig.RuleSets[0].Taint, - "ruleset2": &tomlConfig.RuleSets[1].Taint, - }, - CordonConfigMap: map[string]bool{ - "ruleset1": false, - "ruleset2": true, - }, - RuleSetPriorityMap: map[string]int{ - "ruleset1": 10, - "ruleset2": 5, - }, - }, - ) - - if isQuarantined == nil { - t.Errorf("Expected isQuarantined to be non-nil") - } - - if isQuarantined != nil && *isQuarantined == model.UnQuarantined { - t.Errorf("Node should be quarantined due to rules") - } - - // Check the rule evaluation results - if ruleEvalResult == common.RuleEvaluationRetryAgainInFuture { - t.Errorf("Unexpected rule kind result: %v", ruleEvalResult) - } - - quarantinedNodes := r.nodeInfo.GetQuarantinedNodesCopy() - if !quarantinedNodes["node1"] { - t.Errorf("Expected quarantinedNodesMap[node1] to be true") - } -} - -// Test handleEvent with no rules triggered -func TestHandleEventNoRulesTriggered(t *testing.T) { - ctx := context.Background() - cfg := ReconcilerConfig{ - TomlConfig: config.TomlConfig{ - RuleSets: []config.RuleSet{}, - }, - K8sClient: &mockK8sClient{ - getNodesWithAnnotationFn: func(ctx context.Context, annotationKey string) ([]string, error) { - return []string{}, nil - }, - // Should not be called in this scenario - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelsMap map[string]string) error { - t.Errorf("TaintAndCordonNodeAndSetAnnotations should not be called when no rules triggered.") - return nil - }, - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return 
map[string]string{}, nil - }, - }, - } - - r := NewReconciler(ctx, cfg, nil) - - // Initialize label keys - r.SetLabelKeys(cfg.TomlConfig.LabelPrefix) - - event := &protos.HealthEvent{ - NodeName: "node1", - } - - // Create a wrapper around the health event - healthEventWithStatus := &model.HealthEventWithStatus{ - HealthEvent: event, - } - - isQuarantined, ruleEvalResult := r.handleEvent(ctx, healthEventWithStatus, []evaluator.RuleSetEvaluatorIface{}, rulesetsConfig{ - TaintConfigMap: map[string]*config.Taint{}, - CordonConfigMap: map[string]bool{}, - RuleSetPriorityMap: map[string]int{}, - }) - - if isQuarantined != nil { - t.Errorf("Expected isQuarantined to be nil") - } - - if ruleEvalResult != common.RuleEvaluationNotApplicable { - t.Errorf("Expected HealthEventRuleNotApplicable rule kind, got %v", ruleEvalResult) - } -} - -// Test handleQuarantinedNode: scenario where unquarantine should occur -func TestHandleQuarantinedNodeUnquarantine(t *testing.T) { - ctx := context.Background() - annotationsMap := map[string]string{ - quarantineHealthEventAnnotationKey: `[{ - "NodeName":"node1", - "CheckName":"GpuNvLinkWatch", - "Agent":"agent1", - "Version":1, - "ComponentClass":"GPU", - "EntitiesImpacted":[{"EntityType":"GPU","EntityValue":"0"}] - }]`, - quarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"key1","Value":"val1","Effect":"NoSchedule"}]`, - quarantineHealthEventIsCordonedAnnotationKey: "True", - } - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return annotationsMap, nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - // Update the annotations map for subsequent reads - for k, v := range annotations { - annotationsMap[k] = v - } - return nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - // Check that correct taints and annotations are removed - if nodeName != "node1" { - t.Errorf("Expected node name node1, got %s", nodeName) - } - if len(taints) != 1 { - t.Errorf("Expected 1 taint to remove") - } - if !isUncordon { - t.Errorf("Expected node to be uncordoned") - } - expectedKeys := map[string]bool{ - quarantineHealthEventAnnotationKey: true, - quarantineHealthEventAppliedTaintsAnnotationKey: true, - quarantineHealthEventIsCordonedAnnotationKey: true, - } - for _, k := range annotationKeys { - if !expectedKeys[k] { - t.Errorf("Unexpected annotation key removed: %s", k) - } - } - if len(labelMap) != 2 { - t.Errorf("Expected uncordon labels to be applied on node %s", nodeName) - } - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{ - K8sClient: k8sMock, - }, nil) - - // Initialize label keys - r.SetLabelKeys("k88s.nvidia.com/") - - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", true, true) - - event := &protos.HealthEvent{ - NodeName: "node1", - Agent: "agent1", - CheckName: "GpuNvLinkWatch", // Must match the annotation - ComponentClass: "GPU", // Must match the annotation - Version: 1, - IsHealthy: true, // triggers unquarantine comparison - EntitiesImpacted: []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, // Must match annotation - } - - isQuarantined := r.handleQuarantinedNode(ctx, event) - if isQuarantined { - t.Errorf("Expected node to be unquarantined") - } - quarantinedNodes := r.nodeInfo.GetQuarantinedNodesCopy() - if 
quarantinedNodes["node1"] { - t.Errorf("quarantinedNodesMap[node1] should be false after unquarantine") - } -} - -// Test handleQuarantinedNode: scenario where node stays quarantined -func TestHandleQuarantinedNodeNoUnquarantine(t *testing.T) { - ctx := context.Background() - // The annotation event differs from incoming event - no unquarantine - annotationsMap := map[string]string{ - quarantineHealthEventAnnotationKey: `[{ - "NodeName":"node1", - "CheckName":"GpuNvLinkWatch", - "Agent":"agent1", - "Version":1, - "IsHealthy":true, - "ComponentClass":"GPU", - "EntitiesImpacted":[{"EntityType":"GPU","EntityValue":"0"}] - }]`, - } - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return annotationsMap, nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - // Update the annotations map for subsequent reads - for k, v := range annotations { - annotationsMap[k] = v - } - return nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - t.Errorf("Should not be called if no unquarantine needed") - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{ - K8sClient: k8sMock, - }, nil) - - // Initialize label keys - r.SetLabelKeys("k88s.nvidia.com/") - - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", true, false) - - event := &protos.HealthEvent{ - NodeName: "node1", - Agent: "gpu-health-monitor", // Different agent should not match - CheckName: "GpuNvLinkWatch", - Version: 1, - IsHealthy: true, - EntitiesImpacted: []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}}, - } - - isQuarantined := r.handleQuarantinedNode(ctx, event) - if !isQuarantined { - t.Errorf("Expected node to remain quarantined") - } - quarantinedNodes := r.nodeInfo.GetQuarantinedNodesCopy() - if !quarantinedNodes["node1"] { - t.Errorf("quarantinedNodesMap[node1] should still be true") - } -} - -// Test: Node is first uncordoned (manually), then a health event is sent, and annotation should be removed. 
-func TestHandleEvent_ManualUncordonThenHealthEvent(t *testing.T) { - ctx := context.Background() - - // Step 1: Node is cordoned and annotation is set (simulate FQM quarantine) - originalEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "agent1", - CheckName: "checkA", - ComponentClass: "class1", - Version: 1, - IsHealthy: false, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - }, - } - annotationsMap := map[string]string{ - quarantineHealthEventAnnotationKey: func() string { b, _ := json.Marshal(originalEvent); return string(b) }(), - quarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"key1","Value":"val1","Effect":"NoSchedule"}]`, - quarantineHealthEventIsCordonedAnnotationKey: "True", - } - - // Step 2: Node is manually uncordoned (simulate by setting Unschedulable=false and annotation still present) - annotationRemoved := false - var removedAnnotationKeys []string - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - // Simulate annotation still present after manual uncordon - return annotationsMap, nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - // Update the annotations map for subsequent reads - for k, v := range annotations { - annotationsMap[k] = v - } - return nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - annotationRemoved = true - removedAnnotationKeys = annotationKeys - return nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - // Should not be called in this scenario - t.Errorf("TaintAndCordonNodeAndSetAnnotations should not be called in manual uncordon/annotation removal test") - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - r.SetLabelKeys("k88s.nvidia.com/") - - // Step 3: Node is manually uncordoned (simulate by removing from cache) - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", false, false) - - // Step 4: Health event is sent (simulate a healthy event matching the annotation) - healthEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "agent1", - CheckName: "checkA", - ComponentClass: "class1", - Version: 1, - IsHealthy: true, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - }, - } - healthEventWithStatus := &model.HealthEventWithStatus{HealthEvent: healthEvent} - - status, _ := r.handleEvent(ctx, healthEventWithStatus, nil, rulesetsConfig{}) - - if status == nil { - t.Fatalf("Expected non-nil status when node is manually uncordoned and annotation should be removed") - } - if *status != model.UnQuarantined { - t.Errorf("Expected status UnQuarantined after manual uncordon and annotation removal, got %v", *status) - } - if !annotationRemoved { - t.Errorf("Expected UnTaintAndUnCordonNodeAndRemoveAnnotations to be called to remove annotation") - } - expectedKeys := map[string]bool{ - quarantineHealthEventAnnotationKey: true, - quarantineHealthEventAppliedTaintsAnnotationKey: true, - quarantineHealthEventIsCordonedAnnotationKey: true, - } - for _, k := range removedAnnotationKeys { - if !expectedKeys[k] { - t.Errorf("Unexpected annotation key removed: %s", k) - } - } - // The cache must reflect that the 
node is no longer quarantined - quarantinedNodes := r.nodeInfo.GetQuarantinedNodesCopy() - if quarantinedNodes["node1"] { - t.Errorf("Expected node to be removed from quarantined cache after annotation removal") - } -} - -// TestMultiGPUPartialHealthyEvent tests that partial healthy events for multi-GPU nodes -// do not trigger uncordoning when some GPUs are still unhealthy -func TestMultiGPUPartialHealthyEvent(t *testing.T) { - ctx := context.Background() - - // Annotation event shows 8 GPUs with errors (GPU 0-7) - annotationEvent := &protos.HealthEvent{ - NodeName: "node1", - CheckName: "GpuNvlinkWatch", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - Version: 1, - IsHealthy: false, - Message: "GPU NvLink link is currently down", - ErrorCode: []string{"DCGM_FR_NVLINK_DOWN"}, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - {EntityType: "GPU", EntityValue: "1"}, - {EntityType: "GPU", EntityValue: "2"}, - {EntityType: "GPU", EntityValue: "3"}, - {EntityType: "GPU", EntityValue: "4"}, - {EntityType: "GPU", EntityValue: "5"}, - {EntityType: "GPU", EntityValue: "6"}, - {EntityType: "GPU", EntityValue: "7"}, - }, - } - annotationEventStr, _ := json.Marshal(annotationEvent) - - // Setup mock K8s client with existing quarantine annotation - annotationsMap := map[string]string{ - quarantineHealthEventAnnotationKey: string(annotationEventStr), - quarantineHealthEventIsCordonedAnnotationKey: "True", - quarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"test","Value":"test","Effect":"NoSchedule"}]`, - } - - updateAnnotationsCalled := false - uncordonCalled := false - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return annotationsMap, nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - updateAnnotationsCalled = true - // Update the mock's annotations for subsequent calls - for k, v := range annotations { - annotationsMap[k] = v - } - return nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - uncordonCalled = true - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", true, true) - - // Test 1: Partial healthy event (only GPU 4 recovers) - partialHealthyEvent := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - NodeName: "node1", - CheckName: "GpuNvlinkWatch", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - Version: 1, - IsHealthy: true, - Message: "GPU NvLink watch reported no errors", - ErrorCode: []string{}, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "4"}, - }, - }, - } - - status, _ := r.handleEvent(ctx, partialHealthyEvent, nil, rulesetsConfig{}) - - // Should update annotation but NOT uncordon - if !updateAnnotationsCalled { - t.Errorf("Expected annotation to be updated for partial recovery") - } - if uncordonCalled { - t.Errorf("Expected node to remain cordoned for partial GPU recovery") - } - if status == nil || *status != model.AlreadyQuarantined { - t.Errorf("Expected AlreadyQuarantined status for partial recovery, got %v", status) - } - - // Reset flags - updateAnnotationsCalled = false - uncordonCalled = false - - // Test 2: Full healthy event (all GPUs recover) - fullHealthyEvent := 
&model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - NodeName: "node1", - CheckName: "GpuNvlinkWatch", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - Version: 1, - IsHealthy: true, - Message: "GPU NvLink watch reported no errors", - ErrorCode: []string{}, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - {EntityType: "GPU", EntityValue: "1"}, - {EntityType: "GPU", EntityValue: "2"}, - {EntityType: "GPU", EntityValue: "3"}, - {EntityType: "GPU", EntityValue: "4"}, - {EntityType: "GPU", EntityValue: "5"}, - {EntityType: "GPU", EntityValue: "6"}, - {EntityType: "GPU", EntityValue: "7"}, - }, - }, - } - - // Update the mock to simulate GPU 4 already removed from annotation - updatedAnnotation := &protos.HealthEvent{ - NodeName: "node1", - CheckName: "GpuNvlinkWatch", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - Version: 1, - IsHealthy: false, - Message: "GPU NvLink link is currently down", - ErrorCode: []string{"DCGM_FR_NVLINK_DOWN"}, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - {EntityType: "GPU", EntityValue: "1"}, - {EntityType: "GPU", EntityValue: "2"}, - {EntityType: "GPU", EntityValue: "3"}, - // GPU 4 removed - {EntityType: "GPU", EntityValue: "5"}, - {EntityType: "GPU", EntityValue: "6"}, - {EntityType: "GPU", EntityValue: "7"}, - }, - } - updatedAnnotationStr, _ := json.Marshal(updatedAnnotation) - annotationsMap[quarantineHealthEventAnnotationKey] = string(updatedAnnotationStr) - - status, _ = r.handleEvent(ctx, fullHealthyEvent, nil, rulesetsConfig{}) - - // Should trigger uncordon when all GPUs are healthy - if !uncordonCalled { - t.Errorf("Expected node to be uncordoned when all GPUs recover") - } - if status == nil || *status != model.UnQuarantined { - t.Errorf("Expected UnQuarantined status for full recovery, got %v", status) - } -} - -// TestSkipRedundantCordoning tests that redundant cordoning is skipped when node is already cordoned -func TestSkipRedundantCordoning(t *testing.T) { - ctx := context.Background() - - // Existing annotation for GpuNvlinkWatch - existingEvent := &protos.HealthEvent{ - NodeName: "node1", - CheckName: "GpuNvlinkWatch", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - Version: 1, - IsHealthy: false, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - }, - Message: "GPU 7's NvLink link 15 is currently down Check DCGM and system logs for errors. Reset GPU. Restart DCGM. 
Rerun diagnostics", - } - existingAnnotationStr, _ := json.Marshal(existingEvent) - - annotationsMap := map[string]string{ - quarantineHealthEventAnnotationKey: string(existingAnnotationStr), - quarantineHealthEventIsCordonedAnnotationKey: "True", - } - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return annotationsMap, nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - // Update the annotations map for subsequent reads - for k, v := range annotations { - annotationsMap[k] = v - } - return nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - // Should not be called for redundant cordoning - t.Errorf("TaintAndCordonNode should not be called when node is already cordoned for different check") - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - - // New event with different checkName - newEvent := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - NodeName: "node1", - CheckName: "GpuMemWatch", // Different check - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - Version: 1, - IsHealthy: false, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "1"}, - }, - Message: "GPU 123456 had uncorrectable memory errors and row remapping failed. Run a field diagnostic on the GPU", - }, - } - - status, _ := r.handleEvent(ctx, newEvent, nil, rulesetsConfig{}) - - if status == nil || *status != model.AlreadyQuarantined { - t.Errorf("Expected AlreadyQuarantined status for redundant cordoning, got %v", status) - } -} - -// TestSkipDuplicateUnhealthyEntities tests that duplicate unhealthy events for already tracked entities are skipped -func TestSkipDuplicateUnhealthyEntities(t *testing.T) { - ctx := context.Background() - - // Create new format annotation with GPU 0 and GPU 1 having errors - existingMap := healthEventsAnnotation.NewHealthEventsAnnotationMap() - existingEvent := &protos.HealthEvent{ - NodeName: "node1", - CheckName: "GpuNvlinkWatch", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - Version: 1, - IsHealthy: false, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - {EntityType: "GPU", EntityValue: "1"}, - }, - Message: "GPU 0's NvLink link 15 is currently down Check DCGM and system logs for errors. Reset GPU. Restart DCGM. 
Rerun diagnostics", - } - existingMap.AddOrUpdateEvent(existingEvent) - existingAnnotationStr, _ := json.Marshal(existingMap) - - annotationsMap := map[string]string{ - quarantineHealthEventAnnotationKey: string(existingAnnotationStr), - quarantineHealthEventIsCordonedAnnotationKey: "True", - } - - updateCalled := false - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return annotationsMap, nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - updateCalled = true - for k, v := range annotations { - annotationsMap[k] = v - } - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", true, true) - - // New event with same entities (GPU 0) - should be skipped - duplicateEvent := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - NodeName: "node1", - CheckName: "GpuNvlinkWatch", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - Version: 1, - IsHealthy: false, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, // Already tracked - }, - Message: "GPU 0's NvLink link 15 is currently down Check DCGM and system logs for errors. Reset GPU. Restart DCGM. Rerun diagnostics", - }, - } - - status, _ := r.handleEvent(ctx, duplicateEvent, nil, rulesetsConfig{}) - - if !updateCalled { - // Good - update should not be called for duplicate entities - } else { - t.Errorf("UpdateNodeAnnotations should not be called for duplicate entities") - } - - if status == nil || *status != model.AlreadyQuarantined { - t.Errorf("Expected AlreadyQuarantined status for duplicate entities, got %v", status) - } - - // Now test with a mix of existing and new entities - updateCalled = false - mixedEvent := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - NodeName: "node1", - CheckName: "GpuNvlinkWatch", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - Version: 1, - IsHealthy: false, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "1"}, // Already tracked - {EntityType: "GPU", EntityValue: "2"}, // New entity - }, - Message: "GPU 0's NvLink link 15 is currently down Check DCGM and system logs for errors. Reset GPU. Restart DCGM. 
Rerun diagnostics", - }, - } - - status, _ = r.handleEvent(ctx, mixedEvent, nil, rulesetsConfig{}) - - if !updateCalled { - t.Errorf("UpdateNodeAnnotations should be called when new entities are present") - } - - if status == nil || *status != model.AlreadyQuarantined { - t.Errorf("Expected AlreadyQuarantined status after updating with new entities, got %v", status) - } -} - -// TestHandleEventRuleEvaluationRetry tests handleEvent when an evaluator returns RuleEvaluationRetryAgainInFuture -// TestHandleHealthyEventWithoutQuarantineAnnotation tests that healthy events -// without existing quarantine annotations are skipped -func TestHandleHealthyEventWithoutQuarantineAnnotation(t *testing.T) { - ctx := context.Background() - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - // No quarantine annotations exist - return map[string]string{}, nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - t.Error("TaintAndCordonNode should not be called for healthy events without quarantine annotation") - return nil - }, - } - - mockEvaluator := &mockEvaluator{ - name: "test-eval", - ok: true, - ruleEvalResult: common.RuleEvaluationSuccess, - priority: 1, - version: "v1", - } - - ruleSetEvals := []evaluator.RuleSetEvaluatorIface{mockEvaluator} - - rulesetsConfig := rulesetsConfig{ - TaintConfigMap: map[string]*config.Taint{"test-eval": {Key: "test", Value: "test", Effect: "NoSchedule"}}, - CordonConfigMap: map[string]bool{"test-eval": true}, - RuleSetPriorityMap: map[string]int{"test-eval": 1}, - } - - r := NewReconciler(ctx, ReconcilerConfig{ - K8sClient: k8sMock, - }, nil) - - // Healthy event - event := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - NodeName: "node1", - IsHealthy: true, - }, - } - - status, ruleEval := r.handleEvent(ctx, event, ruleSetEvals, rulesetsConfig) - - // Status should be nil for healthy events without quarantine annotation - if status != nil { - t.Errorf("Expected nil status for healthy event without quarantine annotation, got %v", status) - } - - if ruleEval != common.RuleEvaluationNotApplicable { - t.Errorf("Expected RuleEvaluationNotApplicable, got %v", ruleEval) - } -} - -// TestHandleUnhealthyEventWithoutQuarantineAnnotation tests that unhealthy events -// without existing quarantine annotations are still processed -func TestHandleUnhealthyEventWithoutQuarantineAnnotation(t *testing.T) { - ctx := context.Background() - - taintAndCordonCalled := false - var addedLabels map[string]string - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - // No quarantine annotations exist - return map[string]string{}, nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - taintAndCordonCalled = true - addedLabels = labelMap - return nil - }, - } - - mockEvaluator := &mockEvaluator{ - name: "test-eval", - ok: true, - ruleEvalResult: common.RuleEvaluationSuccess, - priority: 1, - version: "v1", - } - - ruleSetEvals := []evaluator.RuleSetEvaluatorIface{mockEvaluator} - - rulesetsConfig := rulesetsConfig{ - TaintConfigMap: map[string]*config.Taint{"test-eval": {Key: "test", Value: "test", Effect: "NoSchedule"}}, - CordonConfigMap: map[string]bool{"test-eval": true}, - 
RuleSetPriorityMap: map[string]int{"test-eval": 1}, - } - - circuitBreakerConfig := CircuitBreakerConfig{ - Namespace: "nvsentinel", - Name: "fault-quarantine-circuit-breaker", - Percentage: 50, - Duration: 5 * time.Minute, - } - - r := NewReconciler(ctx, ReconcilerConfig{ - K8sClient: k8sMock, - CircuitBreaker: circuitBreakerConfig, - }, nil) - - // Initialize label keys - r.SetLabelKeys("k88s.nvidia.com/") - - // Unhealthy event - event := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - NodeName: "node1", - IsHealthy: false, - }, - } - - status, _ := r.handleEvent(ctx, event, ruleSetEvals, rulesetsConfig) - - // Status should be Quarantined for unhealthy events that trigger rules - if status == nil || *status != model.Quarantined { - t.Errorf("Expected Quarantined status for unhealthy event, got %v", status) - } - - if !taintAndCordonCalled { - t.Error("Expected TaintAndCordonNode to be called for unhealthy event") - } - normalizedTime := time.Now().UTC().Format("2006-01-02T15-04-05Z") - expectedLabels := map[string]string{ - cordonedByLabelKey: common.ServiceName, - cordonedReasonLabelKey: "test-eval", - cordonedTimestampLabelKey: normalizedTime, - statemanager.NVSentinelStateLabelKey: string(statemanager.QuarantinedLabelValue), - } - if _, ok := addedLabels[cordonedTimestampLabelKey]; !ok { - t.Errorf("Missing expected label %s", cordonedTimestampLabelKey) - } - addedLabels[cordonedTimestampLabelKey] = normalizedTime - if !reflect.DeepEqual(addedLabels, expectedLabels) { - t.Errorf("Unexpected set of labels added in TaintAndCordonNodeAndSetAnnotations: %v compared to %v", addedLabels, expectedLabels) - } -} - -func TestHandleEventRuleEvaluationRetry(t *testing.T) { - ctx := context.Background() - - // Create base configuration - cfg := ReconcilerConfig{ - TomlConfig: config.TomlConfig{ - LabelPrefix: "k88s.nvidia.com/", - RuleSets: []config.RuleSet{ - { - Name: "maxPercentageRule", - Taint: config.Taint{ - Key: "key1", - Value: "val1", - Effect: "NoSchedule", - }, - Cordon: config.Cordon{ShouldCordon: true}, - Priority: 10, - }, - }, - }, - K8sClient: &mockK8sClient{ - getNodesWithAnnotationFn: func(ctx context.Context, annotationKey string) ([]string, error) { - return []string{}, nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelsMap map[string]string) error { - return nil - }, - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return map[string]string{}, nil - }, - }, - } - - // Test Case 1: Evaluator returns RetryAgainInFuture (no error) - t.Run("Evaluator returns RetryAgainInFuture (no error)", func(t *testing.T) { - r := NewReconciler(ctx, cfg, nil) - r.SetLabelKeys(cfg.TomlConfig.LabelPrefix) - - // Create evaluator that returns RuleEvaluationRetryAgainInFuture without error - ruleSetEval := &mockEvaluator{ - name: "RuleEvaluationRetryAgainInFuture", - ok: true, // ok=true likely means no error returned by mock - ruleEvalResult: common.RuleEvaluationRetryAgainInFuture, - } - - event := &protos.HealthEvent{ - NodeName: "node1", - } - - // Create a wrapper around the health event - healthEventWithStatus := &model.HealthEventWithStatus{ - HealthEvent: event, - } - - // Call handleEvent with the MaxPercentageRule evaluator - status, ruleEvalResult := r.handleEvent(ctx, healthEventWithStatus, []evaluator.RuleSetEvaluatorIface{ruleSetEval}, - rulesetsConfig{ - TaintConfigMap: map[string]*config.Taint{ - 
"RuleEvaluationRetryAgainInFuture": &cfg.TomlConfig.RuleSets[0].Taint, - }, - CordonConfigMap: map[string]bool{ - "RuleEvaluationRetryAgainInFuture": true, - }, - RuleSetPriorityMap: map[string]int{ - "RuleEvaluationRetryAgainInFuture": 10, - }, - }, - ) - - // When RuleEvaluationRetryAgainInFuture is returned, the node should NOT be quarantined immediately - if status != nil { - t.Errorf("Expected status to be nil when rule evaluation is RetryAgainInFuture, got %v", *status) - } - - // The ruleEvalResult should be RuleEvaluationRetryAgainInFuture - if ruleEvalResult != common.RuleEvaluationRetryAgainInFuture { - t.Errorf("Expected ruleEvalResult to be RuleEvaluationRetryAgainInFuture, got %v", ruleEvalResult) - } - - // Node should NOT be in quarantined map - quarantinedNodes := r.nodeInfo.GetQuarantinedNodesCopy() - if quarantinedNodes["node1"] { - t.Errorf("Expected node NOT to be in quarantined map when rule evaluation is RetryAgainInFuture") - } - }) - - // Test Case 2: Evaluator returns RetryAgainInFuture (with error) - t.Run("Evaluator returns RetryAgainInFuture (with error)", func(t *testing.T) { - r := NewReconciler(ctx, cfg, nil) - r.SetLabelKeys(cfg.TomlConfig.LabelPrefix) - - // Create evaluator that returns RuleEvaluationRetryAgainInFuture with an error - ruleSetEval := &mockEvaluator{ - name: "RuleEvaluationRetryAgainInFuture", - ok: false, // ok=false likely means an error is returned by mock - ruleEvalResult: common.RuleEvaluationRetryAgainInFuture, - } - - event := &protos.HealthEvent{ - NodeName: "node1", - } - - // Create a wrapper around the health event - healthEventWithStatus := &model.HealthEventWithStatus{ - HealthEvent: event, - } - - // Call handleEvent with the MaxPercentageRule evaluator - status, ruleEvalResult := r.handleEvent(ctx, healthEventWithStatus, []evaluator.RuleSetEvaluatorIface{ruleSetEval}, - rulesetsConfig{ - TaintConfigMap: map[string]*config.Taint{ - "RuleEvaluationRetryAgainInFuture": &cfg.TomlConfig.RuleSets[0].Taint, - }, - CordonConfigMap: map[string]bool{ - "RuleEvaluationRetryAgainInFuture": true, - }, - RuleSetPriorityMap: map[string]int{ - "RuleEvaluationRetryAgainInFuture": 10, - }, - }, - ) - - // When RuleEvaluationRetryAgainInFuture is returned (even with error), the node should NOT be quarantined immediately - if status != nil { - t.Errorf("Expected status to be nil when rule evaluation is RetryAgainInFuture (with error), got %v", *status) - } - - // The ruleEvalResult should still be RuleEvaluationRetryAgainInFuture - if ruleEvalResult != common.RuleEvaluationRetryAgainInFuture { - t.Errorf("Expected ruleEvalResult to be RuleEvaluationRetryAgainInFuture (with error), got %v", ruleEvalResult) - } - - // Node should NOT be in quarantined map - quarantinedNodes := r.nodeInfo.GetQuarantinedNodesCopy() - if quarantinedNodes["node1"] { - t.Errorf("Expected node NOT to be in quarantined map when rule evaluation is RetryAgainInFuture (with error)") - } - }) -} - -func TestHandleEventNodeAlreadyCordonedManually(t *testing.T) { - ctx := context.Background() - - tomlConfig := config.TomlConfig{ - LabelPrefix: "k88s.nvidia.com/", - RuleSets: []config.RuleSet{ - { - Name: "ruleset-1", - Taint: config.Taint{ - Key: "key1", - Value: "val1", - Effect: "NoSchedule", - }, - Cordon: config.Cordon{ShouldCordon: true}, - Priority: 1, - }, - }, - } - - // Track if the taint and annotation call was invoked - taintsSeen := []config.Taint{} - annotationsSeen := map[string]string{} - taintCalled := false - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: 
func(ctx context.Context, nodeName string) (map[string]string, error) { - // node is cordoned manually, no FQM annotation yet - return map[string]string{}, nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, - taints []config.Taint, isCordon bool, - annotations map[string]string, labelsMap map[string]string) error { - - taintCalled = true - taintsSeen = append(taintsSeen, taints...) - - for k, v := range annotations { - annotationsSeen[k] = v - } - return nil - }, - } - - circuitBreakerConfig := CircuitBreakerConfig{ - Namespace: "nvsentinel", - Name: "fault-quarantine-circuit-breaker", - Percentage: 50, - Duration: 5 * time.Minute, - } - - cfg := ReconcilerConfig{ - TomlConfig: tomlConfig, - K8sClient: k8sMock, - CircuitBreaker: circuitBreakerConfig, - } - - r := NewReconciler(ctx, cfg, nil) - r.SetLabelKeys(cfg.TomlConfig.LabelPrefix) - - // Simulate that the node has been cordoned manually (unschedulable) but NOT by FQM - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", true, false) - - // Prepare the evaluator which will return success so taint should be applied - ruleSetEvals := []evaluator.RuleSetEvaluatorIface{ - &mockEvaluator{name: "ruleset-1", ruleEvalResult: common.RuleEvaluationSuccess}, - } - - event := &protos.HealthEvent{NodeName: "node1"} - healthEventWithStatus := &model.HealthEventWithStatus{HealthEvent: event} - - status, _ := r.handleEvent(ctx, healthEventWithStatus, ruleSetEvals, - rulesetsConfig{ - TaintConfigMap: map[string]*config.Taint{ - "ruleset-1": &tomlConfig.RuleSets[0].Taint, - }, - CordonConfigMap: map[string]bool{ - "ruleset-1": true, - }, - RuleSetPriorityMap: map[string]int{ - "ruleset-1": 1, - }, - }, - ) - - // The reconciler should attempt to taint & annotate the node even though it was already cordoned manually - if !taintCalled { - t.Errorf("Expected TaintAndCordonNodeAndSetAnnotations to be called for already cordoned node") - } - - if status == nil { - t.Fatalf("Expected non-nil status returned from handleEvent") - } - - if *status != model.Quarantined { - t.Errorf("Expected status to be Quarantined, got %v", *status) - } - - if len(taintsSeen) == 0 { - t.Fatalf("expected at least one taint, got none") - } - if taintsSeen[0] != tomlConfig.RuleSets[0].Taint { - t.Errorf("Unexpected taint values: %+v", taintsSeen[0]) - } - - if _, ok := annotationsSeen[common.QuarantineHealthEventAnnotationKey]; !ok { - t.Errorf("expected %s annotation, but it wasn't passed to the client", - common.QuarantineHealthEventAnnotationKey) - } -} - -// TestHandleEventNodeAlreadyQuarantinedByFQMStillQuarantined verifies that when a node is already -// quarantined by FQM (i.e. has the quarantine annotation) and receives another *unhealthy* event, -// the reconciler skips further processing and keeps the node quarantined. 
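When the cordon path in the tests above fires, the reconciler is expected to attach cordoned-by, cordoned-reason and cordoned-timestamp labels plus the NVSentinel state label, with the timestamp rendered using the layout 2006-01-02T15-04-05Z because Kubernetes label values may not contain ":". A minimal sketch of building that label map follows; the key suffixes and prefix handling are assumptions for illustration, not the module's exact constants.

package main

import (
	"fmt"
	"time"
)

// cordonLabels is a hypothetical helper; the real module derives the keys from
// its configured label prefix and shared constants.
func cordonLabels(prefix, serviceName, ruleSetName string, now time.Time) map[string]string {
	// Label values may not contain ':', so the layout swaps them for '-'.
	ts := now.UTC().Format("2006-01-02T15-04-05Z")

	return map[string]string{
		prefix + "cordoned-by":        serviceName,
		prefix + "cordoned-reason":    ruleSetName,
		prefix + "cordoned-timestamp": ts,
		prefix + "state":              "quarantined",
	}
}

func main() {
	for k, v := range cordonLabels("k8s.nvidia.com/", "fault-quarantine-module", "ruleset-1", time.Now()) {
		fmt.Printf("%s=%s\n", k, v)
	}
}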
-func TestHandleEventNodeAlreadyQuarantinedByFQMStillQuarantined(t *testing.T) { - ctx := context.Background() - - // Build an annotation payload representing the original quarantining event - originalEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "agent1", - CheckName: "checkA", - Version: 1, - // The original event that quarantined the node was unhealthy - IsHealthy: false, - } - - annotationMap := map[string]string{ - quarantineHealthEventAnnotationKey: func() string { b, _ := json.Marshal(originalEvent); return string(b) }(), - } - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return annotationMap, nil - }, - // These functions should NOT be invoked because reconciler should early-return. - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - t.Fatalf("TaintAndCordonNodeAndSetAnnotations should not be called for already FQM-quarantined node (still unhealthy)") - return nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - t.Fatalf("UnTaintAndUnCordonNodeAndRemoveAnnotations should not be called when node remains quarantined") - return nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - for k, v := range annotations { - annotationMap[k] = v - } - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - // Mark node as cordoned/quarantined in the cache to satisfy nodeAlreadyCordoned check - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", true, false) - - // Initialize label keys so that handleQuarantinedNode may construct labels correctly if needed. - r.SetLabelKeys("k88s.nvidia.com/") - - // Incoming event is still unhealthy, hence node should stay quarantined - incomingEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "agent1", - CheckName: "checkA", - Version: 1, - IsHealthy: false, - } - - healthEventWithStatus := &model.HealthEventWithStatus{HealthEvent: incomingEvent} - - status, _ := r.handleEvent(ctx, healthEventWithStatus, nil, rulesetsConfig{}) - - if status == nil { - t.Fatalf("Expected non-nil status when node already quarantined by FQM") - } - if *status != model.AlreadyQuarantined { - t.Errorf("Expected status AlreadyQuarantined, got %v", *status) - } - - // The cache should still indicate the node is quarantined - quarantinedNodes := r.nodeInfo.GetQuarantinedNodesCopy() - if !quarantinedNodes["node1"] { - t.Errorf("Expected node to remain quarantined in cache") - } -} - -// TestHandleEventNodeAlreadyQuarantinedByFQMUnquarantine verifies that when a node is already -// quarantined by FQM but receives the corresponding *healthy* event, the reconciler un-quarantines -// it and updates the status appropriately. 
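Taken together, the already-quarantined tests above and below reduce to one small decision: another unhealthy event keeps the node AlreadyQuarantined, a healthy event that covers only some of the tracked entities trims the annotation but keeps the node cordoned, and a healthy event covering every tracked entity unquarantines it. A rough sketch of that branch, with simplified types that are assumptions rather than the module's own, is shown here before the next test.

package main

import "fmt"

// Status mirrors the model statuses the tests assert on, in simplified form.
type Status string

const (
	AlreadyQuarantined Status = "AlreadyQuarantined"
	UnQuarantined      Status = "UnQuarantined"
)

// decideForQuarantinedNode is a hypothetical sketch of the branch taken for a
// node that already carries the quarantine annotation.
func decideForQuarantinedNode(eventIsHealthy bool, tracked, recovered map[string]bool) Status {
	if !eventIsHealthy {
		// Still unhealthy: nothing to undo, the node stays quarantined.
		return AlreadyQuarantined
	}

	for entity := range tracked {
		if !recovered[entity] {
			// Partial recovery: the annotation is trimmed but the node stays cordoned.
			return AlreadyQuarantined
		}
	}

	// Every tracked entity is healthy again: taints, cordon, annotations and labels are removed.
	return UnQuarantined
}

func main() {
	tracked := map[string]bool{"GPU/0": true, "GPU/1": true}

	fmt.Println(decideForQuarantinedNode(false, tracked, nil))                           // AlreadyQuarantined
	fmt.Println(decideForQuarantinedNode(true, tracked, map[string]bool{"GPU/0": true})) // AlreadyQuarantined
	fmt.Println(decideForQuarantinedNode(true, tracked, tracked))                        // UnQuarantined
}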
-func TestHandleEventNodeAlreadyQuarantinedByFQMUnquarantine(t *testing.T) { - ctx := context.Background() - - // The annotation reflects the original unhealthy event that caused quarantine in new format - originalMap := healthEventsAnnotation.NewHealthEventsAnnotationMap() - originalEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "agent1", - CheckName: "checkA", - ComponentClass: "class1", - Version: 1, - IsHealthy: false, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - }, - } - originalMap.AddOrUpdateEvent(originalEvent) - - annotationMap := map[string]string{ - quarantineHealthEventAnnotationKey: func() string { b, _ := json.Marshal(originalMap); return string(b) }(), - quarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"key1","Value":"val1","Effect":"NoSchedule"}]`, - quarantineHealthEventIsCordonedAnnotationKey: "True", - } - - unquarantineCalled := false - var removedLabels []string - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return annotationMap, nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - unquarantineCalled = true - if !isUncordon { - t.Errorf("Expected isUncordon to be true when un-quarantining the node") - } - removedLabels = labelsToRemove - return nil - }, - // No new tainting expected in this path - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - t.Fatalf("TaintAndCordonNodeAndSetAnnotations should not be called when node is being unquarantined") - return nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - for k, v := range annotations { - annotationMap[k] = v - } - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - // Mark node as currently quarantined - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", true, false) - r.SetLabelKeys("k88s.nvidia.com/") - - // Incoming *healthy* event that matches annotation ‑- should trigger un-quarantine - incomingEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "agent1", - CheckName: "checkA", - ComponentClass: "class1", - Version: 1, - IsHealthy: true, - EntitiesImpacted: []*protos.Entity{{ - EntityType: "GPU", - EntityValue: "0", - }}, - } - - healthEventWithStatus := &model.HealthEventWithStatus{HealthEvent: incomingEvent} - - status, _ := r.handleEvent(ctx, healthEventWithStatus, nil, rulesetsConfig{}) - - if status == nil { - t.Fatalf("Expected non-nil status when node already quarantined by FQM") - } - if *status != model.UnQuarantined { - t.Errorf("Expected status UnQuarantined after healthy event, got %v", *status) - } - - if !unquarantineCalled { - t.Errorf("Expected UnTaintAndUnCordonNodeAndRemoveAnnotations to be invoked for healthy event") - } - - expectedRemovedLabels := []string{ - cordonedByLabelKey, - cordonedReasonLabelKey, - cordonedTimestampLabelKey, - statemanager.NVSentinelStateLabelKey, - } - if !reflect.DeepEqual(removedLabels, expectedRemovedLabels) { - t.Errorf("Unexpected set of labels removed from UnTaintAndUnCordonNodeAndRemoveAnnotations: %v", removedLabels) - } - // The cache must reflect that the node is no longer quarantined - quarantinedNodes := 
r.nodeInfo.GetQuarantinedNodesCopy() - if quarantinedNodes["node1"] { - t.Errorf("Expected node to be removed from quarantined cache after unquarantine") - } -} - -// Test cache consistency during quarantine and unquarantine operations -func TestCacheConsistencyDuringQuarantineUnquarantine(t *testing.T) { - ctx := context.Background() - - tomlConfig := config.TomlConfig{ - LabelPrefix: "k8s.nvidia.com/", - RuleSets: []config.RuleSet{ - { - Name: "ruleset-1", - Taint: config.Taint{ - Key: "key1", - Value: "val1", - Effect: "NoSchedule", - }, - Cordon: config.Cordon{ShouldCordon: true}, - Priority: 1, - }, - }, - } - - // Track API calls - apiCallCount := 0 - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - apiCallCount++ - // Return empty annotations initially - return map[string]string{}, nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, - taints []config.Taint, isCordon bool, - annotations map[string]string, labelsMap map[string]string) error { - return nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, - taints []config.Taint, isUncordon bool, - annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - return nil - }, - getK8sClientFn: func() kubernetes.Interface { - // Return a fake client for buildNodeAnnotationsCache - return fake.NewSimpleClientset() - }, - } - - circuitBreakerConfig := CircuitBreakerConfig{ - Namespace: "nvsentinel", - Name: "fault-quarantine-circuit-breaker", - Percentage: 50, - Duration: 5 * time.Minute, - } - - cfg := ReconcilerConfig{ - TomlConfig: tomlConfig, - K8sClient: k8sMock, - CircuitBreaker: circuitBreakerConfig, - } - - // Create work signal for proper nodeInfo initialization - workSignal := make(chan struct{}, 10) - r := NewReconciler(ctx, cfg, workSignal) - r.SetLabelKeys(cfg.TomlConfig.LabelPrefix) - - // Build initial cache - err := r.buildNodeAnnotationsCache(ctx) - if err != nil { - t.Fatalf("Failed to build initial cache: %v", err) - } - - // Pre-populate cache for node1 with empty annotations to avoid API call - r.nodeAnnotationsCache.Store("node1", map[string]string{}) - - // Test 1: First event - should use cache (empty annotations) - event1 := &protos.HealthEvent{NodeName: "node1"} - healthEventWithStatus1 := &model.HealthEventWithStatus{HealthEvent: event1} - - // Mock evaluator that returns success - ruleSetEvals := []evaluator.RuleSetEvaluatorIface{ - &mockEvaluator{name: "ruleset-1", ruleEvalResult: common.RuleEvaluationSuccess}, - } - - apiCallCount = 0 // Reset counter after initial cache build - status1, _ := r.handleEvent(ctx, healthEventWithStatus1, ruleSetEvals, rulesetsConfig{ - TaintConfigMap: map[string]*config.Taint{"ruleset-1": &tomlConfig.RuleSets[0].Taint}, - CordonConfigMap: map[string]bool{"ruleset-1": true}, - }) - - if apiCallCount != 0 { - t.Errorf("Expected 0 API calls (should use cache), got %d", apiCallCount) - } - - if status1 == nil || *status1 != model.Quarantined { - t.Errorf("Expected Quarantined status, got %v", status1) - } - - // Verify cache was updated with quarantine annotations - cached, ok := r.nodeAnnotationsCache.Load("node1") - if !ok { - t.Fatal("Expected node1 to be in cache after quarantine") - } - - cachedAnnotations := cached.(map[string]string) - if _, exists := cachedAnnotations[common.QuarantineHealthEventAnnotationKey]; !exists { - t.Error("Expected quarantine annotation in cache after quarantine operation") - } - - // Test 2: Second 
event on same node - should use updated cache - event2 := &protos.HealthEvent{ - NodeName: "node1", - IsHealthy: true, - Agent: event1.Agent, - CheckName: event1.CheckName, - ComponentClass: event1.ComponentClass, - Version: event1.Version, - } - healthEventWithStatus2 := &model.HealthEventWithStatus{HealthEvent: event2} - - // Update mock to return quarantine annotations if API is called (shouldn't be) - k8sMock.getNodeAnnotationsFn = func(ctx context.Context, nodeName string) (map[string]string, error) { - apiCallCount++ - t.Error("API should not be called - cache should be used") - return cachedAnnotations, nil - } - - status2, _ := r.handleEvent(ctx, healthEventWithStatus2, nil, rulesetsConfig{}) - - if apiCallCount != 0 { - t.Errorf("Expected 0 API calls for second event (should use cache), got %d", apiCallCount) - } - - if status2 == nil || *status2 != model.UnQuarantined { - t.Errorf("Expected UnQuarantined status, got %v", status2) - } - - // Verify cache was updated to remove quarantine annotations - cached2, ok2 := r.nodeAnnotationsCache.Load("node1") - if !ok2 { - t.Fatal("Expected node1 to still be in cache after unquarantine") - } - - cachedAnnotations2 := cached2.(map[string]string) - if len(cachedAnnotations2) != 0 { - t.Errorf("Expected empty annotations in cache after unquarantine, got %v", cachedAnnotations2) - } -} - -// Test cache fallback behavior when node not in cache -func TestCacheFallbackForUncachedNode(t *testing.T) { - ctx := context.Background() - - apiCallCount := 0 - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - apiCallCount++ - return map[string]string{ - common.QuarantineHealthEventAnnotationKey: "existing-event", - }, nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - - // Don't build initial cache - simulate a new node - annotations, err := r.getNodeQuarantineAnnotations(ctx, "new-node") - - if err != nil { - t.Fatalf("Expected successful API fallback, got error: %v", err) - } - - if apiCallCount != 1 { - t.Errorf("Expected 1 API call for uncached node, got %d", apiCallCount) - } - - if annotations[common.QuarantineHealthEventAnnotationKey] != "existing-event" { - t.Errorf("Expected quarantine annotation from API, got %v", annotations) - } - - // Verify node was added to cache - cached, ok := r.nodeAnnotationsCache.Load("new-node") - if !ok { - t.Fatal("Expected new-node to be cached after API call") - } - - cachedAnnotations := cached.(map[string]string) - if cachedAnnotations[common.QuarantineHealthEventAnnotationKey] != "existing-event" { - t.Error("Expected API result to be cached") - } - - // Second call should use cache - apiCallCount = 0 - annotations2, err2 := r.getNodeQuarantineAnnotations(ctx, "new-node") - - if err2 != nil { - t.Fatalf("Expected successful cache hit, got error: %v", err2) - } - - if apiCallCount != 0 { - t.Errorf("Expected 0 API calls (should use cache), got %d", apiCallCount) - } - - if annotations2[common.QuarantineHealthEventAnnotationKey] != "existing-event" { - t.Errorf("Expected cached annotation, got %v", annotations2) - } -} - -// Test manual uncordon scenario with cache -func TestManualUncordonWithCache(t *testing.T) { - ctx := context.Background() - - originalEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "agent1", - CheckName: "checkA", - ComponentClass: "class1", - Version: 1, - IsHealthy: false, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - }, - } - // 
Convert to array format as healthEventsAnnotationMap expects an array - annotationPayload, _ := json.Marshal([]*protos.HealthEvent{originalEvent}) - - apiCallCount := 0 - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - apiCallCount++ - // Should not be called if cache is working - t.Error("API should not be called when cache has the data") - return map[string]string{ - quarantineHealthEventAnnotationKey: string(annotationPayload), - }, nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, - taints []config.Taint, isUncordon bool, - annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - r.SetLabelKeys("k8s.nvidia.com/") - - // Pre-populate cache with quarantine annotations (simulating node was quarantined) - r.nodeAnnotationsCache.Store("node1", map[string]string{ - quarantineHealthEventAnnotationKey: string(annotationPayload), - quarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"key1","Value":"val1","Effect":"NoSchedule"}]`, - quarantineHealthEventIsCordonedAnnotationKey: "True", - }) - - // Simulate manual uncordon by updating nodeInfo - // (In reality, this would be done by the node informer) - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", false, true) - - // Send healthy event that matches the quarantine - healthyEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "agent1", - CheckName: "checkA", - ComponentClass: "class1", - Version: 1, - IsHealthy: true, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - }, - } - healthEventWithStatus := &model.HealthEventWithStatus{HealthEvent: healthyEvent} - - apiCallCount = 0 // Reset counter - status, _ := r.handleEvent(ctx, healthEventWithStatus, nil, rulesetsConfig{}) - - if apiCallCount != 0 { - t.Errorf("Expected 0 API calls (should use cache), got %d", apiCallCount) - } - - if status == nil || *status != model.UnQuarantined { - t.Errorf("Expected UnQuarantined status after manual uncordon + healthy event, got %v", status) - } - - // Verify cache was updated to remove annotations - cached, ok := r.nodeAnnotationsCache.Load("node1") - if !ok { - t.Fatal("Expected node1 to still be in cache") - } - - cachedAnnotations := cached.(map[string]string) - if len(cachedAnnotations) != 0 { - t.Errorf("Expected empty annotations in cache after cleanup, got %v", cachedAnnotations) - } -} - -// Test concurrent cache access -func TestConcurrentCacheAccess(t *testing.T) { - ctx := context.Background() - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - // Simulate some delay - time.Sleep(10 * time.Millisecond) - return map[string]string{ - common.QuarantineHealthEventAnnotationKey: nodeName + "-event", - }, nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - - // Pre-populate cache with some nodes - for i := 0; i < 10; i++ { - nodeName := fmt.Sprintf("node%d", i) - r.nodeAnnotationsCache.Store(nodeName, map[string]string{ - common.QuarantineHealthEventAnnotationKey: nodeName + "-cached", - }) - } - - var wg sync.WaitGroup - errors := make(chan error, 100) - - // Concurrent readers - for i := 0; i < 50; i++ { - wg.Add(1) - go func(nodeNum int) { - defer wg.Done() - nodeName := fmt.Sprintf("node%d", nodeNum%10) - annotations, err := r.getNodeQuarantineAnnotations(ctx, 
nodeName) - if err != nil { - errors <- fmt.Errorf("reader error for %s: %v", nodeName, err) - return - } - // Check if we got the expected value (might be updated by concurrent writers) - actual := annotations[common.QuarantineHealthEventAnnotationKey] - // Accept either the original cached value or an updated value from concurrent writers - if actual != nodeName+"-cached" && !strings.HasPrefix(actual, nodeName+"-updated-") { - errors <- fmt.Errorf("expected %s-cached or %s-updated-*, got %s", - nodeName, nodeName, actual) - } - }(i) - } - - // Concurrent writers (updating cache) - for i := 0; i < 20; i++ { - wg.Add(1) - go func(nodeNum int) { - defer wg.Done() - nodeName := fmt.Sprintf("node%d", nodeNum%10) - newAnnotations := map[string]string{ - common.QuarantineHealthEventAnnotationKey: nodeName + "-updated-" + fmt.Sprintf("%d", nodeNum), - } - r.updateCacheWithQuarantineAnnotations(nodeName, newAnnotations) - }(i) - } - - // Concurrent deleters - only delete from specific nodes to avoid conflicts - for i := 20; i < 25; i++ { - wg.Add(1) - go func(nodeNum int) { - defer wg.Done() - nodeName := fmt.Sprintf("node%d", nodeNum) - // Pre-populate these nodes for deletion - r.nodeAnnotationsCache.Store(nodeName, map[string]string{ - common.QuarantineHealthEventAnnotationKey: nodeName + "-to-delete", - }) - // Then delete the annotation - r.updateCacheWithUnquarantineAnnotations(nodeName, - []string{common.QuarantineHealthEventAnnotationKey}) - }(i) - } - - wg.Wait() - close(errors) - - // Check for any errors - var errorCount int - for err := range errors { - t.Errorf("Concurrent access error: %v", err) - errorCount++ - } - - if errorCount > 0 { - t.Fatalf("Had %d errors during concurrent access", errorCount) - } -} - -// Test cache behavior when annotations change externally -func TestCacheUpdateFromNodeInformer(t *testing.T) { - ctx := context.Background() - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - // Should not be called if cache is properly updated by informer - t.Error("API should not be called when cache is updated by informer") - return nil, nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - - // Simulate node informer callback with new annotations - newAnnotations := map[string]string{ - common.QuarantineHealthEventAnnotationKey: "external-event", - common.QuarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"external","Value":"taint"}]`, - } - r.handleNodeAnnotationChange("node1", newAnnotations) - - // Try to get annotations - should come from cache - annotations, err := r.getNodeQuarantineAnnotations(ctx, "node1") - if err != nil { - t.Fatalf("Expected successful cache hit, got error: %v", err) - } - - if annotations[common.QuarantineHealthEventAnnotationKey] != "external-event" { - t.Errorf("Expected externally updated annotation, got %v", annotations) - } - - // Simulate node deletion - r.handleNodeAnnotationChange("node1", nil) - - // Verify node was removed from cache - _, ok := r.nodeAnnotationsCache.Load("node1") - if ok { - t.Error("Expected node1 to be removed from cache after deletion") - } -} - -// Test buildNodeAnnotationsCache with various node states -func TestBuildNodeAnnotationsCacheWithVariousStates(t *testing.T) { - ctx := context.Background() - - // Create fake k8s client with various nodes - node1 := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - Annotations: map[string]string{ - 
common.QuarantineHealthEventAnnotationKey: "event1", - "other-annotation": "ignored", - }, - }, - } - - node2 := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node2", - Annotations: map[string]string{ - common.QuarantineHealthEventIsCordonedAnnotationKey: "True", - }, - }, - } - - node3 := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node3", - Annotations: map[string]string{}, // No quarantine annotations - }, - } - - node4 := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node4", - // No annotations at all - }, - } - - fakeClient := fake.NewSimpleClientset(node1, node2, node3, node4) - - k8sMock := &mockK8sClient{ - getK8sClientFn: func() kubernetes.Interface { - return fakeClient - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - - err := r.buildNodeAnnotationsCache(ctx) - if err != nil { - t.Fatalf("Failed to build cache: %v", err) - } - - // Verify all nodes are in cache - tests := []struct { - nodeName string - expectedAnns map[string]string - expectedInCache bool - }{ - { - nodeName: "node1", - expectedAnns: map[string]string{ - common.QuarantineHealthEventAnnotationKey: "event1", - }, - expectedInCache: true, - }, - { - nodeName: "node2", - expectedAnns: map[string]string{ - common.QuarantineHealthEventIsCordonedAnnotationKey: "True", - }, - expectedInCache: true, - }, - { - nodeName: "node3", - expectedAnns: map[string]string{}, - expectedInCache: true, - }, - { - nodeName: "node4", - expectedAnns: map[string]string{}, - expectedInCache: true, - }, - } - - for _, tt := range tests { - cached, ok := r.nodeAnnotationsCache.Load(tt.nodeName) - if ok != tt.expectedInCache { - t.Errorf("Node %s: expected in cache = %v, got %v", tt.nodeName, tt.expectedInCache, ok) - continue - } - - if !ok { - continue - } - - cachedAnns := cached.(map[string]string) - if len(cachedAnns) != len(tt.expectedAnns) { - t.Errorf("Node %s: expected %d annotations, got %d", tt.nodeName, - len(tt.expectedAnns), len(cachedAnns)) - } - - for key, expectedVal := range tt.expectedAnns { - if cachedAnns[key] != expectedVal { - t.Errorf("Node %s: expected annotation %s=%s, got %s", - tt.nodeName, key, expectedVal, cachedAnns[key]) - } - } - - // Verify non-quarantine annotations are not cached - if _, exists := cachedAnns["other-annotation"]; exists { - t.Errorf("Node %s: non-quarantine annotation should not be cached", tt.nodeName) - } - } -} - -// Test cache performance with many events -func TestCachePerformanceWithManyEvents(t *testing.T) { - ctx := context.Background() - - apiCallCount := 0 - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - apiCallCount++ - // Simulate API latency - time.Sleep(5 * time.Millisecond) - return map[string]string{}, nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, - taints []config.Taint, isCordon bool, - annotations map[string]string, labelsMap map[string]string) error { - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - - // Pre-populate cache with 100 nodes - for i := 0; i < 100; i++ { - nodeName := fmt.Sprintf("node%d", i) - r.nodeAnnotationsCache.Store(nodeName, map[string]string{}) - } - - // Process 1000 events across 100 nodes - startTime := time.Now() - for i := 0; i < 1000; i++ { - nodeName := fmt.Sprintf("node%d", i%100) - event := &protos.HealthEvent{NodeName: nodeName} - healthEventWithStatus := &model.HealthEventWithStatus{HealthEvent: event} - - // Mock evaluator 
that returns not applicable - ruleSetEvals := []evaluator.RuleSetEvaluatorIface{ - &mockEvaluator{name: "ruleset-1", ruleEvalResult: common.RuleEvaluationNotApplicable}, - } - - r.handleEvent(ctx, healthEventWithStatus, ruleSetEvals, rulesetsConfig{}) - } - duration := time.Since(startTime) - - if apiCallCount > 0 { - t.Errorf("Expected 0 API calls with cache, got %d", apiCallCount) - } - - if duration > 200*time.Millisecond { - t.Errorf("Processing 1000 events took too long: %v (expected < 200ms)", duration) - } - - t.Logf("Processed 1000 events in %v with cache (0 API calls)", duration) -} - -// Test that mutations to returned maps don't affect the cache -func TestCacheReturnsCopyNotReference(t *testing.T) { - ctx := context.Background() - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return map[string]string{ - common.QuarantineHealthEventAnnotationKey: "original-value", - "other-annotation": "should-be-ignored", - }, nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - - // Test 1: fetchAndCacheQuarantineAnnotations returns a copy - annotations1, err := r.fetchAndCacheQuarantineAnnotations(ctx, "test-node") - if err != nil { - t.Fatalf("Expected successful fetch, got error: %v", err) - } - - // Mutate the returned map - annotations1[common.QuarantineHealthEventAnnotationKey] = "mutated-value" - annotations1["new-key"] = "new-value" - - // Get from cache again (should use cached version) - annotations2, err := r.getNodeQuarantineAnnotations(ctx, "test-node") - if err != nil { - t.Fatalf("Expected successful cache hit, got error: %v", err) - } - - // Verify the cached value wasn't mutated - if annotations2[common.QuarantineHealthEventAnnotationKey] != "original-value" { - t.Errorf("Cache was mutated! Expected 'original-value', got '%s'", - annotations2[common.QuarantineHealthEventAnnotationKey]) - } - - if _, exists := annotations2["new-key"]; exists { - t.Error("Cache was mutated! Unexpected key 'new-key' found in cache") - } - - // Test 2: getNodeQuarantineAnnotations also returns a copy - annotations3, err := r.getNodeQuarantineAnnotations(ctx, "test-node") - if err != nil { - t.Fatalf("Expected successful cache hit, got error: %v", err) - } - - // Mutate this map too - annotations3[common.QuarantineHealthEventAnnotationKey] = "another-mutation" - - // Get from cache once more - annotations4, err := r.getNodeQuarantineAnnotations(ctx, "test-node") - if err != nil { - t.Fatalf("Expected successful cache hit, got error: %v", err) - } - - // Verify the cached value still wasn't mutated - if annotations4[common.QuarantineHealthEventAnnotationKey] != "original-value" { - t.Errorf("Cache was mutated! 
Expected 'original-value', got '%s'", - annotations4[common.QuarantineHealthEventAnnotationKey]) - } -} - -// TestHandleManualUncordon tests the manual uncordon handler -func TestHandleManualUncordon(t *testing.T) { - ctx := context.Background() - - originalEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "agent1", - CheckName: "checkA", - ComponentClass: "class1", - Version: 1, - IsHealthy: false, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - }, - } - - // Simulate FQ annotations on the node - existingAnnotations := map[string]string{ - common.QuarantineHealthEventAnnotationKey: func() string { b, _ := json.Marshal(originalEvent); return string(b) }(), - common.QuarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"key1","Value":"val1","Effect":"NoSchedule"}]`, - common.QuarantineHealthEventIsCordonedAnnotationKey: common.QuarantineHealthEventIsCordonedAnnotationValueTrue, - } - - removedAnnotationKeys := []string{} - addedAnnotations := map[string]string{} - var removedTaints []config.Taint - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - if nodeName != "node1" { - t.Errorf("Expected node1, got %s", nodeName) - } - return existingAnnotations, nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - if nodeName != "node1" { - t.Errorf("Expected node1, got %s", nodeName) - } - if isUncordon { - t.Errorf("Should not try to uncordon again - node is already manually uncordoned") - } - removedAnnotationKeys = annotationKeys - removedTaints = taints - - expectedLabelsToRemove := []string{statemanager.NVSentinelStateLabelKey} - if !slices.Equal(expectedLabelsToRemove, labelsToRemove) { - t.Errorf("Should remove labels %v, got %v", expectedLabelsToRemove, labelsToRemove) - } - if len(labelMap) > 0 { - t.Errorf("Should not add any labels, got %v", labelMap) - } - - return nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - if nodeName != "node1" { - t.Errorf("Expected node1, got %s", nodeName) - } - if isCordon { - t.Errorf("Should not cordon the node") - } - if len(taints) > 0 { - t.Errorf("Should not add any taints") - } - addedAnnotations = annotations - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - r.SetLabelKeys("k8s.nvidia.com/") - - // Initialize nodeInfo and mark node as quarantined - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", true, true) - - // Call handleManualUncordon - err := r.handleManualUncordon("node1") - if err != nil { - t.Fatalf("handleManualUncordon failed: %v", err) - } - - // Verify annotations were removed - expectedRemovedAnnotations := []string{ - common.QuarantineHealthEventAnnotationKey, - common.QuarantineHealthEventAppliedTaintsAnnotationKey, - common.QuarantineHealthEventIsCordonedAnnotationKey, - } - if len(removedAnnotationKeys) != len(expectedRemovedAnnotations) { - t.Errorf("Expected %d annotations to be removed, got %d", len(expectedRemovedAnnotations), len(removedAnnotationKeys)) - } - for _, key := range expectedRemovedAnnotations { - found := false - for _, removedKey := range removedAnnotationKeys { - if removedKey == key { - found = true - break - } - } - if !found { - t.Errorf("Expected annotation %s 
to be removed", key) - } - } - - // Verify taints were removed - if len(removedTaints) != 1 { - t.Errorf("Expected 1 taint to be removed, got %d", len(removedTaints)) - } else { - if removedTaints[0].Key != "key1" || removedTaints[0].Value != "val1" || removedTaints[0].Effect != "NoSchedule" { - t.Errorf("Unexpected taint removed: %+v", removedTaints[0]) - } - } - - // Verify manual uncordon annotation was added - if addedAnnotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey] != common.QuarantinedNodeUncordonedManuallyAnnotationValue { - t.Errorf("Expected manual uncordon annotation to be added, got %v", addedAnnotations) - } - - // Verify node is no longer marked as quarantined in nodeInfo cache - // We update this immediately for consistency with the metric - quarantinedNodes := r.nodeInfo.GetQuarantinedNodesCopy() - if quarantinedNodes["node1"] { - t.Errorf("Expected node1 to be removed from quarantined nodes cache") - } - - // Note: We don't verify the annotation cache here because it will be updated - // by onNodeAnnotationsChanged when the subsequent update event is processed -} - -// TestManualUncordonEndToEnd tests the complete flow of manual uncordon from detection to state updates -func TestManualUncordonEndToEnd(t *testing.T) { - ctx := context.Background() - - // Create test nodes - quarantinedNode := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-node-1", - Labels: map[string]string{ - informer.GpuNodeLabel: "true", - statemanager.NVSentinelStateLabelKey: string(statemanager.RemediationFailedLabelValue), - }, - Annotations: map[string]string{ - common.QuarantineHealthEventAnnotationKey: `{"nodeName":"test-node-1","agent":"test","checkName":"test","isHealthy":false}`, - common.QuarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"fault","Value":"gpu","Effect":"NoSchedule"}]`, - common.QuarantineHealthEventIsCordonedAnnotationKey: common.QuarantineHealthEventIsCordonedAnnotationValueTrue, - }, - }, - Spec: corev1.NodeSpec{ - Unschedulable: true, // Node is cordoned - }, - } - - normalNode := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-node-2", - Labels: map[string]string{ - informer.GpuNodeLabel: "true", - }, - }, - Spec: corev1.NodeSpec{ - Unschedulable: false, - }, - } - - // Create fake k8s client with initial nodes - fakeClient := fake.NewSimpleClientset(quarantinedNode, normalNode) - - // Track API calls with mutex protection for concurrent access - var mu sync.Mutex - updateCount := 0 - - // Wrap the client to intercept updates - fakeClient.PrependReactor("update", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - mu.Lock() - updateCount++ - mu.Unlock() - return false, nil, nil // Let it proceed - }) - - // Create reconciler with mock K8s client - mockK8sClient := &mockK8sClient{ - getK8sClientFn: func() kubernetes.Interface { - return fakeClient - }, - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - node, err := fakeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if err != nil { - return nil, err - } - return node.Annotations, nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - // Simulate removing annotations and taints - node, err := fakeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if err != nil { - return err - } - - // Remove specified 
annotations - for _, key := range annotationKeys { - delete(node.Annotations, key) - } - - // Remove specified taints - var newTaints []corev1.Taint - for _, existingTaint := range node.Spec.Taints { - shouldRemove := false - for _, taintToRemove := range taints { - if existingTaint.Key == taintToRemove.Key && - existingTaint.Value == taintToRemove.Value && - string(existingTaint.Effect) == taintToRemove.Effect { - shouldRemove = true - break - } - } - if !shouldRemove { - newTaints = append(newTaints, existingTaint) - } - } - node.Spec.Taints = newTaints - - delete(node.Labels, statemanager.NVSentinelStateLabelKey) - - _, err = fakeClient.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) - return err - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - // Simulate adding annotations - node, err := fakeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if err != nil { - return err - } - - if node.Annotations == nil { - node.Annotations = make(map[string]string) - } - for k, v := range annotations { - node.Annotations[k] = v - } - - _, err = fakeClient.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) - return err - }, - } - - // Create reconciler - workSignal := make(chan struct{}, 10) - r := NewReconciler(ctx, ReconcilerConfig{ - K8sClient: mockK8sClient, - DryRun: false, - }, workSignal) - r.SetLabelKeys("k8s.nvidia.com/") - - // Create NodeInformer - nodeInformer, err := informer.NewNodeInformer(fakeClient, 0, workSignal, r.nodeInfo) - if err != nil { - t.Fatalf("Failed to create NodeInformer: %v", err) - } - - // Track callback invocations with thread-safe access - manualUncordonCalled := false - manualUncordonNodeName := "" - annotationChanges := make(map[string]int) // Track how many times each node's annotations changed - - // Set up callbacks - nodeInformer.SetOnManualUncordonCallback(func(nodeName string) error { - mu.Lock() - manualUncordonCalled = true - manualUncordonNodeName = nodeName - mu.Unlock() - return r.handleManualUncordon(nodeName) - }) - - nodeInformer.SetOnNodeAnnotationsChangedCallback(func(nodeName string, annotations map[string]string) { - mu.Lock() - annotationChanges[nodeName]++ - mu.Unlock() - r.handleNodeAnnotationChange(nodeName, annotations) - }) - - // Start the informer - stopCh := make(chan struct{}) - defer close(stopCh) - - go nodeInformer.Run(stopCh) - - // Wait for initial sync - if !cache.WaitForCacheSync(stopCh, nodeInformer.HasSynced) { - t.Fatalf("Failed to sync cache") - } - - // Initial state verification - totalGpu, cordonedMap, err := nodeInformer.GetGpuNodeCounts() - if err != nil { - t.Fatalf("Failed to get initial counts: %v", err) - } - if totalGpu != 2 { - t.Errorf("Expected 2 GPU nodes, got %d", totalGpu) - } - if len(cordonedMap) != 1 || !cordonedMap["test-node-1"] { - t.Errorf("Expected test-node-1 to be cordoned, got %v", cordonedMap) - } - - // Simulate manual uncordon by updating the node - quarantinedNode.Spec.Unschedulable = false - _, err = fakeClient.CoreV1().Nodes().Update(ctx, quarantinedNode, metav1.UpdateOptions{}) - if err != nil { - t.Fatalf("Failed to simulate manual uncordon: %v", err) - } - - // Wait for the informer to process the update event - timeout := time.After(2 * time.Second) - ticker := time.NewTicker(10 * time.Millisecond) - defer ticker.Stop() - - manualUncordonDetected := false - for !manualUncordonDetected { - select { - case <-timeout: - 
t.Fatal("Timeout waiting for manual uncordon to be detected") - case <-ticker.C: - mu.Lock() - manualUncordonDetected = manualUncordonCalled - mu.Unlock() - } - } - - // Verify manual uncordon was detected and handled - mu.Lock() - if !manualUncordonCalled { - t.Error("Manual uncordon callback was not called") - } - if manualUncordonNodeName != "test-node-1" { - t.Errorf("Expected manual uncordon for test-node-1, got %s", manualUncordonNodeName) - } - mu.Unlock() - - // Wait for the node to be updated by the reconciler - nodeUpdated := false - timeout = time.After(2 * time.Second) - ticker = time.NewTicker(10 * time.Millisecond) - defer ticker.Stop() - - for !nodeUpdated { - select { - case <-timeout: - t.Fatal("Timeout waiting for node to be updated with manual uncordon annotation") - case <-ticker.C: - node, err := fakeClient.CoreV1().Nodes().Get(ctx, "test-node-1", metav1.GetOptions{}) - if err == nil { - if val := node.Annotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey]; val == common.QuarantinedNodeUncordonedManuallyAnnotationValue { - nodeUpdated = true - } - } - } - } - - // Verify the node was updated with manual uncordon annotation - updatedNode, err := fakeClient.CoreV1().Nodes().Get(ctx, "test-node-1", metav1.GetOptions{}) - if err != nil { - t.Fatalf("Failed to get updated node: %v", err) - } - - // Check FQ annotations were removed - if _, exists := updatedNode.Annotations[common.QuarantineHealthEventAnnotationKey]; exists { - t.Error("QuarantineHealthEvent annotation should be removed") - } - if _, exists := updatedNode.Annotations[common.QuarantineHealthEventAppliedTaintsAnnotationKey]; exists { - t.Error("QuarantineHealthEventAppliedTaints annotation should be removed") - } - if _, exists := updatedNode.Annotations[common.QuarantineHealthEventIsCordonedAnnotationKey]; exists { - t.Error("QuarantineHealthEventIsCordoned annotation should be removed") - } - - // Check dgxc.nvidia.com/nvsentinel-state label was removed but expected labels are persisted - if _, exists := updatedNode.Labels[statemanager.NVSentinelStateLabelKey]; exists { - t.Errorf("%s label should be removed", statemanager.NVSentinelStateLabelKey) - } - if _, exists := updatedNode.Labels[informer.GpuNodeLabel]; !exists { - t.Errorf("%s label should be persisted", informer.GpuNodeLabel) - } - - // Check manual uncordon annotation was added - if val := updatedNode.Annotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey]; val != common.QuarantinedNodeUncordonedManuallyAnnotationValue { - t.Errorf("Expected manual uncordon annotation to be 'True', got %s", val) - } - - // Verify final state - totalGpu, cordonedMap, err = nodeInformer.GetGpuNodeCounts() - if err != nil { - t.Fatalf("Failed to get final counts: %v", err) - } - if totalGpu != 2 { - t.Errorf("Expected 2 GPU nodes, got %d", totalGpu) - } - if len(cordonedMap) != 0 { - t.Errorf("Expected no cordoned nodes after manual uncordon, got %v", cordonedMap) - } - - // Verify nodeInfo cache is consistent - quarantinedNodes := r.nodeInfo.GetQuarantinedNodesCopy() - if quarantinedNodes["test-node-1"] { - t.Error("test-node-1 should not be in quarantined nodes cache") - } - - // Verify annotation change callbacks were invoked - mu.Lock() - if annotationChanges["test-node-1"] < 1 { - t.Error("Expected annotation change callback to be invoked for test-node-1") - } - - // Verify at least 2 updates occurred (one for removing annotations, one for adding manual uncordon annotation) - if updateCount < 2 { - t.Errorf("Expected at least 2 node 
updates, got %d", updateCount) - } - mu.Unlock() -} - -// TestNodeRequarantineAfterManualUncordon tests that when FQ quarantines a node again after manual uncordon, -// the manual uncordon annotation is removed -func TestNodeRequarantineAfterManualUncordon(t *testing.T) { - ctx := context.Background() - - // Track annotations that were removed - var removedAnnotations []string - hasManualUncordonAnnotation := true - - // Create mock K8s client - mockK8sClient := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - // Simulate node with manual uncordon annotation - if hasManualUncordonAnnotation { - return map[string]string{ - common.QuarantinedNodeUncordonedManuallyAnnotationKey: common.QuarantinedNodeUncordonedManuallyAnnotationValue, - common.QuarantineHealthEventAnnotationKey: "old-event", - }, nil - } - return map[string]string{ - common.QuarantineHealthEventAnnotationKey: "old-event", - }, nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - // Track which annotations were removed - removedAnnotations = append(removedAnnotations, annotationKeys...) - // Simulate removal - for _, key := range annotationKeys { - if key == common.QuarantinedNodeUncordonedManuallyAnnotationKey { - hasManualUncordonAnnotation = false - } - } - return nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - return nil - }, - getK8sClientFn: func() kubernetes.Interface { - return fake.NewSimpleClientset() - }, - } - - // Create reconciler - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: mockK8sClient}, nil) - r.SetLabelKeys("k8s.nvidia.com/") - - // Build cache with the node having manual uncordon annotation - r.nodeAnnotationsCache.Store("test-node", map[string]string{ - common.QuarantinedNodeUncordonedManuallyAnnotationKey: common.QuarantinedNodeUncordonedManuallyAnnotationValue, - common.QuarantineHealthEventAnnotationKey: "old-event", - }) - - // Simulate quarantine action that should remove manual uncordon annotation - // This would normally be called from applyRule, but we'll test the specific part - nodeAnnotations, err := r.getNodeQuarantineAnnotations(ctx, "test-node") - if err != nil { - t.Fatalf("Failed to get node annotations: %v", err) - } - - if _, hasManualUncordon := nodeAnnotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey]; hasManualUncordon { - // Remove the manual uncordon annotation before applying quarantine - if err := mockK8sClient.UnTaintAndUnCordonNodeAndRemoveAnnotations( - ctx, - "test-node", - nil, // No taints to remove - false, // Not uncordoning - []string{common.QuarantinedNodeUncordonedManuallyAnnotationKey}, // Remove manual uncordon annotation - nil, // No labels to remove - nil, // No labels to add - ); err == nil { - // Update cache to remove the manual uncordon annotation - r.updateCacheWithUnquarantineAnnotations("test-node", - []string{common.QuarantinedNodeUncordonedManuallyAnnotationKey}) - } - } - - // Verify manual uncordon annotation was removed - found := false - for _, annotation := range removedAnnotations { - if annotation == common.QuarantinedNodeUncordonedManuallyAnnotationKey { - found = true - break - } - } - - if !found { - t.Errorf("Expected manual uncordon annotation to be removed when FQ quarantines 
node again") - } - - // Verify cache was updated - cachedAnnotations, _ := r.getNodeQuarantineAnnotations(ctx, "test-node") - if _, stillHasAnnotation := cachedAnnotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey]; stillHasAnnotation { - t.Errorf("Cache should not contain manual uncordon annotation after removal") - } -} - -// TestCircuitBreakerBasicFunctionality tests basic circuit breaker operations -func TestCircuitBreakerBasicFunctionality(t *testing.T) { - ctx := context.Background() - - // Mock circuit breaker state storage - breakerState := "CLOSED" - cbStateReadCount := 0 - cbStateWriteCount := 0 - - mockK8sClient := &mockK8sClient{ - ensureConfigMapFn: func(ctx context.Context, name, namespace string, initialStatus string) error { - if initialStatus != "CLOSED" { - t.Errorf("Expected initial state to be CLOSED, got %s", initialStatus) - } - return nil - }, - readCBStateFn: func(ctx context.Context, name, namespace string) (string, error) { - cbStateReadCount++ - return breakerState, nil - }, - writeCBStateFn: func(ctx context.Context, name, namespace, status string) error { - cbStateWriteCount++ - breakerState = status - return nil - }, - getTotalGpuNodesFn: func(ctx context.Context) (int, error) { - return 10, nil // 10 total nodes for testing - }, - } - - circuitBreakerConfig := CircuitBreakerConfig{ - Namespace: "test-namespace", - Name: "test-circuit-breaker", - Percentage: 50, // 50% threshold - Duration: 5 * time.Minute, - } - - cfg := ReconcilerConfig{ - K8sClient: mockK8sClient, - CircuitBreakerEnabled: true, - CircuitBreaker: circuitBreakerConfig, - } - - r := NewReconciler(ctx, cfg, nil) - - // Test 1: Circuit breaker should be initialized in CLOSED state - if r.cb == nil { - t.Fatal("Circuit breaker should be initialized when enabled") - } - - currentState := r.cb.CurrentState() - if currentState != "CLOSED" { - t.Errorf("Expected initial state to be CLOSED, got %s", currentState) - } - - // Test 2: Circuit breaker should not be tripped initially - isTripped, err := r.cb.IsTripped(ctx) - if err != nil { - t.Fatalf("Error checking if circuit breaker is tripped: %v", err) - } - if isTripped { - t.Error("Circuit breaker should not be tripped initially") - } - - // Test 3: Add cordon events below threshold (4 out of 10 nodes = 40% < 50%) - for i := 0; i < 4; i++ { - r.cb.AddCordonEvent(fmt.Sprintf("node-%d", i)) - } - - isTripped, err = r.cb.IsTripped(ctx) - if err != nil { - t.Fatalf("Error checking if circuit breaker is tripped: %v", err) - } - if isTripped { - t.Error("Circuit breaker should not be tripped at 40% threshold") - } - - // Test 4: Add one more cordon event to reach threshold (5 out of 10 nodes = 50%) - r.cb.AddCordonEvent("node-4") - - isTripped, err = r.cb.IsTripped(ctx) - if err != nil { - t.Fatalf("Error checking if circuit breaker is tripped: %v", err) - } - if !isTripped { - t.Error("Circuit breaker should be tripped at 50% threshold") - } - - // Test 5: Verify state was persisted - if cbStateWriteCount == 0 { - t.Error("Circuit breaker state should have been written to ConfigMap") - } - if breakerState != "TRIPPED" { - t.Errorf("Expected persisted state to be TRIPPED, got %s", breakerState) - } - - // Test 6: Force state back to CLOSED - err = r.cb.ForceState(ctx, "CLOSED") - if err != nil { - t.Fatalf("Error forcing circuit breaker state: %v", err) - } - - currentState = r.cb.CurrentState() - if currentState != "CLOSED" { - t.Errorf("Expected state to be CLOSED after forcing, got %s", currentState) - } - - // Note: IsTripped() will 
automatically trip the breaker again if the threshold is still exceeded - // So we just verify the current state is CLOSED, but IsTripped() will return true due to the - // cordon events still being in the sliding window - isTripped, err = r.cb.IsTripped(ctx) - if err != nil { - t.Fatalf("Error checking if circuit breaker is tripped: %v", err) - } - // The breaker will be tripped again because the cordon events are still within the sliding window - if !isTripped { - t.Error("Circuit breaker should be tripped again due to cordon events still in sliding window") - } -} - -// TestCircuitBreakerSlidingWindow tests sliding window behavior -func TestCircuitBreakerSlidingWindow(t *testing.T) { - ctx := context.Background() - - mockK8sClient := &mockK8sClient{ - ensureConfigMapFn: func(ctx context.Context, name, namespace string, initialStatus string) error { - return nil - }, - readCBStateFn: func(ctx context.Context, name, namespace string) (string, error) { - return "CLOSED", nil - }, - writeCBStateFn: func(ctx context.Context, name, namespace, status string) error { - return nil - }, - getTotalGpuNodesFn: func(ctx context.Context) (int, error) { - return 10, nil // 10 total nodes - }, - } - - circuitBreakerConfig := CircuitBreakerConfig{ - Namespace: "test-namespace", - Name: "test-circuit-breaker", - Percentage: 50, // 50% threshold - Duration: 2 * time.Second, // Short window for testing - } - - cfg := ReconcilerConfig{ - K8sClient: mockK8sClient, - CircuitBreakerEnabled: true, - CircuitBreaker: circuitBreakerConfig, - } - - r := NewReconciler(ctx, cfg, nil) - - // Add cordon events to reach threshold - for i := 0; i < 5; i++ { - r.cb.AddCordonEvent(fmt.Sprintf("node-%d", i)) - } - - // Should be tripped immediately - isTripped, err := r.cb.IsTripped(ctx) - if err != nil { - t.Fatalf("Error checking if circuit breaker is tripped: %v", err) - } - if !isTripped { - t.Error("Circuit breaker should be tripped after reaching threshold") - } - - // Force state back to CLOSED to test sliding window - err = r.cb.ForceState(ctx, "CLOSED") - if err != nil { - t.Fatalf("Error forcing circuit breaker state: %v", err) - } - - // Wait for sliding window to expire (events should age out) - time.Sleep(3 * time.Second) - - // Should not be tripped anymore due to sliding window - isTripped, err = r.cb.IsTripped(ctx) - if err != nil { - t.Fatalf("Error checking if circuit breaker is tripped: %v", err) - } - if isTripped { - t.Error("Circuit breaker should not be tripped after sliding window expires") - } -} - -// TestCircuitBreakerUniqueNodeTracking tests that duplicate cordon events for same node are handled correctly -func TestCircuitBreakerUniqueNodeTracking(t *testing.T) { - ctx := context.Background() - - mockK8sClient := &mockK8sClient{ - ensureConfigMapFn: func(ctx context.Context, name, namespace string, initialStatus string) error { - return nil - }, - readCBStateFn: func(ctx context.Context, name, namespace string) (string, error) { - return "CLOSED", nil - }, - writeCBStateFn: func(ctx context.Context, name, namespace, status string) error { - return nil - }, - getTotalGpuNodesFn: func(ctx context.Context) (int, error) { - return 10, nil // 10 total nodes - }, - } - - circuitBreakerConfig := CircuitBreakerConfig{ - Namespace: "test-namespace", - Name: "test-circuit-breaker", - Percentage: 50, // 50% threshold (5 out of 10 nodes) - Duration: 5 * time.Minute, - } - - cfg := ReconcilerConfig{ - K8sClient: mockK8sClient, - CircuitBreakerEnabled: true, - CircuitBreaker: circuitBreakerConfig, - } - - r := 
NewReconciler(ctx, cfg, nil) - - // Add same node multiple times - should only count once - for i := 0; i < 10; i++ { - r.cb.AddCordonEvent("node-1") - } - - // Should not be tripped because only 1 unique node was cordoned - isTripped, err := r.cb.IsTripped(ctx) - if err != nil { - t.Fatalf("Error checking if circuit breaker is tripped: %v", err) - } - if isTripped { - t.Error("Circuit breaker should not be tripped with only 1 unique node cordoned") - } - - // Add 4 more unique nodes to reach threshold - for i := 2; i <= 5; i++ { - r.cb.AddCordonEvent(fmt.Sprintf("node-%d", i)) - } - - // Now should be tripped (5 unique nodes = 50%) - isTripped, err = r.cb.IsTripped(ctx) - if err != nil { - t.Fatalf("Error checking if circuit breaker is tripped: %v", err) - } - if !isTripped { - t.Error("Circuit breaker should be tripped with 5 unique nodes cordoned") - } -} - -// TestBackwardCompatibilityAppendNewEvent tests that when an old single-event annotation exists -// and a new fatal event arrives, the system converts to the new format and appends the new event -func TestBackwardCompatibilityAppendNewEvent(t *testing.T) { - ctx := context.Background() - - // Existing old format annotation (single event) - existingOldEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - CheckName: "GpuXidError", - Version: 1, - IsHealthy: false, - IsFatal: true, - Message: "XID 62 error", - ErrorCode: []string{"62"}, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - }, - } - - oldAnnotationStr, _ := json.Marshal(existingOldEvent) - annotationsMap := map[string]string{ - quarantineHealthEventAnnotationKey: string(oldAnnotationStr), - quarantineHealthEventIsCordonedAnnotationKey: "True", - quarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"gpu-xid-error","Value":"true","Effect":"NoSchedule"}]`, - } - - updateCount := 0 - var capturedAnnotation string - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return annotationsMap, nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - updateCount++ - for k, v := range annotations { - annotationsMap[k] = v - if k == quarantineHealthEventAnnotationKey { - capturedAnnotation = v - } - } - return nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - // Already cordoned, shouldn't be called - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{ - K8sClient: k8sMock, - }, nil) - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", true, true) - - // New fatal event for a different GPU - newEvent := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - NodeName: "node1", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - CheckName: "NVLinkError", - Version: 1, - IsHealthy: false, - IsFatal: true, - Message: "NVLink down", - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "1"}, - }, - }, - } - - // Create mock evaluator - mockEval := &mockEvaluator{ - name: "ruleset1", - ok: true, - ruleEvalResult: common.RuleEvaluationSuccess, - priority: 10, - version: "v1", - } - ruleSetEvals := []evaluator.RuleSetEvaluatorIface{mockEval} - - defaultTaint := config.Taint{Key: "gpu-error", Value: "true", Effect: "NoSchedule"} - rulesetsConf := rulesetsConfig{ - TaintConfigMap: 
map[string]*config.Taint{ - "ruleset1": &defaultTaint, - }, - CordonConfigMap: map[string]bool{ - "ruleset1": true, - }, - RuleSetPriorityMap: map[string]int{ - "ruleset1": 10, - }, - } - - status, _ := r.handleEvent(ctx, newEvent, ruleSetEvals, rulesetsConf) - - // Verify conversion happened and new event was added - if updateCount < 1 { - t.Errorf("Expected annotation update for format conversion and new event addition") - } - - // Verify the annotation was converted to new format - var newFormatMap healthEventsAnnotation.HealthEventsAnnotationMap - if err := json.Unmarshal([]byte(capturedAnnotation), &newFormatMap); err != nil { - t.Fatalf("Failed to unmarshal new format annotation: %v", err) - } - - // Should have both events tracked - if newFormatMap.Count() != 2 { - t.Errorf("Expected 2 events in new format (1 converted + 1 new), got %d", newFormatMap.Count()) - } - - // Verify both events are present - if _, found := newFormatMap.GetEvent(existingOldEvent); !found { - t.Errorf("Converted old event not found in new format") - } - if _, found := newFormatMap.GetEvent(newEvent.HealthEvent); !found { - t.Errorf("New event not found in new format") - } - - if status == nil || *status != model.AlreadyQuarantined { - t.Errorf("Expected AlreadyQuarantined status, got %v", status) - } -} - -// TestBackwardCompatibilityHealthyEventRemoval tests that when an old single-event annotation exists -// and the corresponding healthy event arrives, the system converts format and handles recovery -func TestBackwardCompatibilityHealthyEventRemoval(t *testing.T) { - ctx := context.Background() - - // Existing old format annotation (single unhealthy event) - existingOldEvent := &protos.HealthEvent{ - NodeName: "node1", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - CheckName: "GpuXidError", - Version: 1, - IsHealthy: false, - IsFatal: true, - Message: "XID 62 error", - ErrorCode: []string{"62"}, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - }, - } - - oldAnnotationStr, _ := json.Marshal(existingOldEvent) - annotationsMap := map[string]string{ - quarantineHealthEventAnnotationKey: string(oldAnnotationStr), - quarantineHealthEventIsCordonedAnnotationKey: "True", - quarantineHealthEventAppliedTaintsAnnotationKey: `[{"Key":"gpu-xid-error","Value":"true","Effect":"NoSchedule"}]`, - } - - uncordonCalled := false - updateCount := 0 - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - return annotationsMap, nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - updateCount++ - for k, v := range annotations { - annotationsMap[k] = v - } - return nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - uncordonCalled = true - // Clear annotations to simulate removal - for _, key := range annotationKeys { - delete(annotationsMap, key) - } - return nil - }, - } - - r := NewReconciler(ctx, ReconcilerConfig{K8sClient: k8sMock}, nil) - r.nodeInfo.MarkNodeQuarantineStatusCache("node1", true, true) - - // Corresponding healthy event - healthyEvent := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - NodeName: "node1", - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - CheckName: "GpuXidError", - Version: 1, - IsHealthy: true, - Message: "GPU recovered", - 
EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "0"}, - }, - }, - } - - status, _ := r.handleEvent(ctx, healthyEvent, nil, rulesetsConfig{}) - - // Verify format conversion happened first - if updateCount < 1 { - t.Errorf("Expected at least one annotation update for format conversion") - } - - // Verify uncordon was called - if !uncordonCalled { - t.Errorf("Expected node to be uncordoned after healthy event") - } - - // Verify status - if status == nil || *status != model.UnQuarantined { - t.Errorf("Expected UnQuarantined status, got %v", status) - } - - // Verify annotations were removed - if _, exists := annotationsMap[quarantineHealthEventAnnotationKey]; exists { - t.Errorf("Expected quarantine annotation to be removed after recovery") - } -} - -func TestEntityLevelQuarantineAndRecovery(t *testing.T) { - ctx := context.Background() - - // Track what operations were called - var updateAnnotationsCalled bool - var uncordonCalled bool - var lastAnnotations map[string]string - - k8sMock := &mockK8sClient{ - getNodeAnnotationsFn: func(ctx context.Context, nodeName string) (map[string]string, error) { - // Initially no annotations - if lastAnnotations == nil { - return map[string]string{}, nil - } - return lastAnnotations, nil - }, - taintAndCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isCordon bool, annotations map[string]string, labelMap map[string]string) error { - // Store the annotations for future calls - lastAnnotations = make(map[string]string) - for k, v := range annotations { - lastAnnotations[k] = v - } - return nil - }, - updateNodeAnnotationsFn: func(ctx context.Context, nodeName string, annotations map[string]string) error { - updateAnnotationsCalled = true - // Update stored annotations - if lastAnnotations == nil { - lastAnnotations = make(map[string]string) - } - for k, v := range annotations { - lastAnnotations[k] = v - } - return nil - }, - unTaintAndUnCordonNodeFn: func(ctx context.Context, nodeName string, taints []config.Taint, isUncordon bool, annotationKeys []string, labelsToRemove []string, labelMap map[string]string) error { - uncordonCalled = true - // Clear annotations after uncordon - lastAnnotations = map[string]string{} - return nil - }, - } - tomlConfig := config.TomlConfig{ - LabelPrefix: "k88s.nvidia.com/", - RuleSets: []config.RuleSet{ - { - Name: "ruleset1", - Taint: config.Taint{ - Key: "key1", - Value: "val1", - Effect: "NoSchedule", - }, - Cordon: config.Cordon{ShouldCordon: false}, - Priority: 10, - }, - { - Name: "ruleset2", - Taint: config.Taint{ - Key: "key2", - Value: "val2", - Effect: "NoExecute", - }, - Cordon: config.Cordon{ShouldCordon: true}, - Priority: 5, - }, - }, - } - r := NewReconciler(ctx, ReconcilerConfig{ - K8sClient: k8sMock, - TomlConfig: tomlConfig, - }, nil) - r.SetLabelKeys("k88s.nvidia.com/") - - // Create mock evaluators for rule evaluation - mockEval1 := &mockEvaluator{ - name: "ruleset1", - ok: true, - ruleEvalResult: common.RuleEvaluationSuccess, - priority: 10, - version: "v1", - } - - mockEval2 := &mockEvaluator{ - name: "ruleset2", - ok: true, - ruleEvalResult: common.RuleEvaluationSuccess, - priority: 5, - version: "v1", - } - - ruleSetEvals := []evaluator.RuleSetEvaluatorIface{mockEval1, mockEval2} - - // Properly configure rulesetsConfig - rulesetsConf := rulesetsConfig{ - TaintConfigMap: map[string]*config.Taint{ - "ruleset1": &tomlConfig.RuleSets[0].Taint, - "ruleset2": &tomlConfig.RuleSets[1].Taint, - }, - CordonConfigMap: map[string]bool{ - "ruleset1": 
tomlConfig.RuleSets[0].Cordon.ShouldCordon, - "ruleset2": tomlConfig.RuleSets[1].Cordon.ShouldCordon, - }, - RuleSetPriorityMap: map[string]int{ - "ruleset1": tomlConfig.RuleSets[0].Priority, - "ruleset2": tomlConfig.RuleSets[1].Priority, - }, - } - - // Test 1: Initial GPU 1 failure should quarantine node - gpu1FailEvent := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - CheckName: "GpuXidError", - NodeName: "node1", - Version: 1, - IsFatal: true, - IsHealthy: false, - Message: "XID error occurred", - ErrorCode: []string{"62"}, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "1"}, - }, - }, - } - - status1, _ := r.handleEvent(ctx, gpu1FailEvent, ruleSetEvals, rulesetsConf) - if status1 == nil || *status1 != model.Quarantined { - t.Errorf("Expected GPU 1 failure to quarantine node, got status: %v", status1) - } - - // Verify annotation contains GPU 1 failure - healthEventAnnotation := lastAnnotations[common.QuarantineHealthEventAnnotationKey] - if !strings.Contains(healthEventAnnotation, `"entityValue":"1"`) { - t.Errorf("Annotation should contain GPU 1 entity: %s", healthEventAnnotation) - } - - // Test 2: GPU 2 failure should be added to existing quarantine - updateAnnotationsCalled = false - gpu2FailEvent := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - CheckName: "GpuXidError", - NodeName: "node1", - Version: 1, - IsFatal: true, - IsHealthy: false, - Message: "XID error occurred", - ErrorCode: []string{"62"}, - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "2"}, - }, - }, - } - - status2, _ := r.handleEvent(ctx, gpu2FailEvent, ruleSetEvals, rulesetsConf) - if status2 == nil || *status2 != model.AlreadyQuarantined { - t.Errorf("Expected GPU 2 failure to be added to quarantine, got status: %v", status2) - } - - if !updateAnnotationsCalled { - t.Errorf("UpdateNodeAnnotations should be called when adding GPU 2 failure") - } - - // Verify annotation now contains both GPU 1 and GPU 2 - healthEventAnnotation = lastAnnotations[common.QuarantineHealthEventAnnotationKey] - if !strings.Contains(healthEventAnnotation, `"entityValue":"1"`) { - t.Errorf("Annotation should still contain GPU 1: %s", healthEventAnnotation) - } - if !strings.Contains(healthEventAnnotation, `"entityValue":"2"`) { - t.Errorf("Annotation should now contain GPU 2: %s", healthEventAnnotation) - } - - // Test 3: GPU 1 recovery should remove only GPU 1, node stays quarantined - updateAnnotationsCalled = false - uncordonCalled = false - gpu1RecoveryEvent := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - CheckName: "GpuXidError", - NodeName: "node1", - Version: 1, - IsHealthy: true, - Message: "No health failures", - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "1"}, // Only GPU 1 recovers - }, - }, - } - - status3, _ := r.handleEvent(ctx, gpu1RecoveryEvent, ruleSetEvals, rulesetsConf) - if status3 == nil || *status3 != model.AlreadyQuarantined { - t.Errorf("Expected partial recovery to keep node quarantined, got status: %v", status3) - } - - if !updateAnnotationsCalled { - t.Errorf("UpdateNodeAnnotations should be called for partial recovery") - } - if uncordonCalled { - t.Errorf("Node should not be uncordoned during partial recovery") - } - - // Verify annotation now contains only GPU 2 - healthEventAnnotation = 
lastAnnotations[common.QuarantineHealthEventAnnotationKey] - if strings.Contains(healthEventAnnotation, `"entityValue":"1"`) { - t.Errorf("Annotation should not contain GPU 1 after recovery: %s", healthEventAnnotation) - } - if !strings.Contains(healthEventAnnotation, `"entityValue":"2"`) { - t.Errorf("Annotation should still contain GPU 2: %s", healthEventAnnotation) - } - - // Test 4: GPU 2 recovery should uncordon node - updateAnnotationsCalled = false - uncordonCalled = false - gpu2RecoveryEvent := &model.HealthEventWithStatus{ - HealthEvent: &protos.HealthEvent{ - Agent: "gpu-health-monitor", - ComponentClass: "GPU", - CheckName: "GpuXidError", - NodeName: "node1", - Version: 1, - IsHealthy: true, - Message: "No health failures", - EntitiesImpacted: []*protos.Entity{ - {EntityType: "GPU", EntityValue: "2"}, // GPU 2 recovers - }, - }, - } - - status4, _ := r.handleEvent(ctx, gpu2RecoveryEvent, ruleSetEvals, rulesetsConf) - if status4 == nil || *status4 != model.UnQuarantined { - t.Errorf("Expected complete recovery to unquarantine node, got status: %v", status4) - } - - if uncordonCalled != true { - t.Errorf("Node should be uncordoned after all entities recover") - } -} diff --git a/fault-remediation-module/go.mod b/fault-remediation-module/go.mod index 6ee086424..d60bd4b45 100644 --- a/fault-remediation-module/go.mod +++ b/fault-remediation-module/go.mod @@ -46,6 +46,7 @@ require ( github.com/golang/snappy v1.0.0 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect + github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d // indirect github.com/google/uuid v1.6.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.18.0 // indirect @@ -53,6 +54,8 @@ require ( github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/montanaflynn/stats v0.7.1 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/onsi/ginkgo/v2 v2.26.0 // indirect + github.com/onsi/gomega v1.38.2 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.1 // indirect @@ -63,6 +66,8 @@ require ( github.com/xdg-go/scram v1.1.2 // indirect github.com/xdg-go/stringprep v1.0.4 // indirect github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.43.0 // indirect @@ -72,6 +77,7 @@ require ( golang.org/x/term v0.36.0 // indirect golang.org/x/text v0.30.0 // indirect golang.org/x/time v0.14.0 // indirect + golang.org/x/tools v0.38.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f // indirect google.golang.org/grpc v1.76.0 // indirect google.golang.org/protobuf v1.36.10 // indirect diff --git a/fault-remediation-module/go.sum b/fault-remediation-module/go.sum index 78dae9888..35040d5e9 100644 --- a/fault-remediation-module/go.sum +++ b/fault-remediation-module/go.sum @@ -1,3 +1,5 @@ +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod 
h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= @@ -63,8 +65,8 @@ github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7O github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d h1:KJIErDwbSHjnp/SGzE5ed8Aol7JsKiI5X7yWKAtzhM0= +github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -89,10 +91,10 @@ github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8 github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= -github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw= -github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/onsi/ginkgo/v2 v2.26.0 h1:1J4Wut1IlYZNEAWIV3ALrT9NfiaGW2cDCJQSFQMs/gE= +github.com/onsi/ginkgo/v2 v2.26.0/go.mod h1:qhEywmzWTBUY88kfO0BRvX4py7scov9yR+Az2oavUzw= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -133,14 +135,16 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= 
go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -160,6 +164,8 @@ golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -202,8 +208,8 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE= -golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/health-events-analyzer/go.mod b/health-events-analyzer/go.mod index 3d5b661fc..5d6958e42 100644 --- a/health-events-analyzer/go.mod +++ b/health-events-analyzer/go.mod @@ -37,6 +37,8 @@ require ( github.com/xdg-go/scram v1.1.2 // indirect github.com/xdg-go/stringprep v1.0.4 // indirect github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect golang.org/x/crypto v0.43.0 // indirect golang.org/x/net v0.46.0 // indirect diff --git a/health-events-analyzer/go.sum b/health-events-analyzer/go.sum index dad28cc57..5bdda3198 
100644 --- a/health-events-analyzer/go.sum +++ b/health-events-analyzer/go.sum @@ -62,14 +62,14 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= diff --git a/health-monitors/csp-health-monitor/go.mod b/health-monitors/csp-health-monitor/go.mod index 5e399477c..24c75717c 100644 --- a/health-monitors/csp-health-monitor/go.mod +++ b/health-monitors/csp-health-monitor/go.mod @@ -30,7 +30,7 @@ require ( ) require ( - cloud.google.com/go v0.121.0 // indirect + cloud.google.com/go v0.121.1 // indirect cloud.google.com/go/auth v0.17.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect cloud.google.com/go/compute/metadata v0.9.0 // indirect diff --git a/health-monitors/csp-health-monitor/go.sum b/health-monitors/csp-health-monitor/go.sum index aedc1a1b9..57aee2fe5 100644 --- a/health-monitors/csp-health-monitor/go.sum +++ b/health-monitors/csp-health-monitor/go.sum @@ -1,7 +1,7 @@ cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= -cloud.google.com/go v0.121.0 h1:pgfwva8nGw7vivjZiRfrmglGWiCJBP+0OmDpenG/Fwg= -cloud.google.com/go v0.121.0/go.mod h1:rS7Kytwheu/y9buoDmu5EIpMMCI4Mb8ND4aeN4Vwj7Q= +cloud.google.com/go v0.121.1 h1:S3kTQSydxmu1JfLRLpKtxRPA7rSrYPRPEUmL/PavVUw= +cloud.google.com/go v0.121.1/go.mod h1:nRFlrHq39MNVWu+zESP2PosMWA0ryJw8KUBZ2iZpxbw= cloud.google.com/go/auth v0.17.0 h1:74yCm7hCj2rUyyAocqnFzsAYXgJhrG26XCFimrc/Kz4= cloud.google.com/go/auth v0.17.0/go.mod h1:6wv/t5/6rOPAX4fJiRjKkJCvswLwdet7G8+UGXt7nCQ= cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= @@ -18,8 +18,8 @@ cloud.google.com/go/longrunning v0.6.7 h1:IGtfDWHhQCgCjwQjV9iiLnUta9LBCo8R9QmAFs cloud.google.com/go/longrunning v0.6.7/go.mod h1:EAFV3IZAKmM56TyiE6VAP3VoTzhZzySwI/YI1s/nRsY= cloud.google.com/go/monitoring v1.24.2 h1:5OTsoJ1dXYIiMiuL+sYscLc9BumrL3CarVLL7dd7lHM= 
cloud.google.com/go/monitoring v1.24.2/go.mod h1:x7yzPWcgDRnPEv3sI+jJGBkwl5qINf+6qY4eq0I9B4U= -cloud.google.com/go/storage v1.52.0 h1:ROpzMW/IwipKtatA69ikxibdzQSiXJrY9f6IgBa9AlA= -cloud.google.com/go/storage v1.52.0/go.mod h1:4wrBAbAYUvYkbrf19ahGm4I5kDQhESSqN3CGEkMGvOY= +cloud.google.com/go/storage v1.53.0 h1:gg0ERZwL17pJ+Cz3cD2qS60w1WMDnwcm5YPAIQBHUAw= +cloud.google.com/go/storage v1.53.0/go.mod h1:7/eO2a/srr9ImZW9k5uufcNahT2+fPb8w5it1i5boaA= github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg= github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.29.0 h1:UQUsRi8WTzhZntp5313l+CHIAT95ojUI2lpP/ExlZa4= diff --git a/janitor/go.mod b/janitor/go.mod index 651fcab90..3ded98ec8 100644 --- a/janitor/go.mod +++ b/janitor/go.mod @@ -5,16 +5,16 @@ go 1.25 toolchain go1.25.3 require ( - cloud.google.com/go/compute v1.36.1 + cloud.google.com/go/compute v1.38.0 github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.9.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0 - github.com/aws/aws-sdk-go-v2/config v1.31.12 + github.com/aws/aws-sdk-go-v2/config v1.31.15 github.com/aws/aws-sdk-go-v2/service/ec2 v1.254.1 github.com/go-logr/logr v1.4.3 github.com/nvidia/nvsentinel/commons v0.0.0 - github.com/onsi/ginkgo/v2 v2.22.0 - github.com/onsi/gomega v1.36.1 + github.com/onsi/ginkgo/v2 v2.26.0 + github.com/onsi/gomega v1.38.2 github.com/oracle/oci-go-sdk/v65 v65.102.1 github.com/prometheus/client_golang v1.23.2 github.com/spf13/viper v1.21.0 @@ -26,39 +26,52 @@ require ( require ( cel.dev/expr v0.24.0 // indirect - cloud.google.com/go/auth v0.16.0 // indirect + cloud.google.com/go v0.121.1 // indirect + cloud.google.com/go/auth v0.17.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect - cloud.google.com/go/compute/metadata v0.6.0 // indirect + cloud.google.com/go/compute/metadata v0.9.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.1 // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 // indirect + github.com/Masterminds/semver/v3 v3.4.0 // indirect github.com/antlr4-go/antlr/v4 v4.13.0 // indirect - github.com/aws/aws-sdk-go-v2 v1.39.2 // indirect - github.com/aws/aws-sdk-go-v2/credentials v1.18.16 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.9 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.9 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.9 // indirect - github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.9 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.29.6 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.1 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.38.6 // indirect - github.com/aws/smithy-go v1.23.0 // indirect + github.com/aws/aws-sdk-go-v2 v1.39.4 // indirect + github.com/aws/aws-sdk-go-v2/credentials v1.18.19 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.11 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.11 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.11 // indirect + github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding 
v1.13.2 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.11 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.29.8 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.3 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.38.9 // indirect + github.com/aws/smithy-go v1.23.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect - github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect - github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect - github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-openapi/jsonpointer v0.22.1 // indirect + github.com/go-openapi/jsonreference v0.21.2 // indirect + github.com/go-openapi/swag v0.25.1 // indirect + github.com/go-openapi/swag/cmdutils v0.25.1 // indirect + github.com/go-openapi/swag/conv v0.25.1 // indirect + github.com/go-openapi/swag/fileutils v0.25.1 // indirect + github.com/go-openapi/swag/jsonname v0.25.1 // indirect + github.com/go-openapi/swag/jsonutils v0.25.1 // indirect + github.com/go-openapi/swag/loading v0.25.1 // indirect + github.com/go-openapi/swag/mangling v0.25.1 // indirect + github.com/go-openapi/swag/netutils v0.25.1 // indirect + github.com/go-openapi/swag/stringutils v0.25.1 // indirect + github.com/go-openapi/swag/typeutils v0.25.1 // indirect + github.com/go-openapi/swag/yamlutils v0.25.1 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/go-viper/mapstructure/v2 v2.4.0 // indirect github.com/gofrs/flock v0.10.0 // indirect @@ -68,27 +81,24 @@ require ( github.com/google/cel-go v0.26.0 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect + github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d // indirect github.com/google/s2a-go v0.1.9 // indirect github.com/google/uuid v1.6.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect - github.com/googleapis/gax-go/v2 v2.14.1 // indirect + github.com/googleapis/gax-go/v2 v2.15.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kylelemons/godebug v1.1.0 // indirect - github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect - github.com/pkg/errors v0.9.1 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/pmezard/go-difflib 
v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.66.1 // indirect - github.com/prometheus/procfs v0.16.1 // indirect + github.com/prometheus/common v0.67.1 // indirect + github.com/prometheus/procfs v0.17.0 // indirect github.com/sagikazarmark/locafero v0.11.0 // indirect github.com/sony/gobreaker v0.5.0 // indirect github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 // indirect @@ -100,47 +110,48 @@ require ( github.com/subosito/gotenv v1.6.0 // indirect github.com/x448/float16 v0.8.4 // indirect github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect - go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect + go.opentelemetry.io/otel v1.38.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect - go.opentelemetry.io/otel/metric v1.35.0 // indirect - go.opentelemetry.io/otel/sdk v1.35.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/sdk v1.37.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect go.opentelemetry.io/proto/otlp v1.5.0 // indirect + go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.41.0 // indirect + golang.org/x/crypto v0.43.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/net v0.43.0 // indirect - golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/mod v0.29.0 // indirect + golang.org/x/net v0.46.0 // indirect + golang.org/x/oauth2 v0.32.0 // indirect golang.org/x/sync v0.17.0 // indirect - golang.org/x/sys v0.35.0 // indirect - golang.org/x/term v0.34.0 // indirect - golang.org/x/text v0.28.0 // indirect - golang.org/x/time v0.11.0 // indirect - golang.org/x/tools v0.35.0 // indirect + golang.org/x/sys v0.37.0 // indirect + golang.org/x/term v0.36.0 // indirect + golang.org/x/text v0.30.0 // indirect + golang.org/x/time v0.14.0 // indirect + golang.org/x/tools v0.38.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/api v0.229.0 // indirect - google.golang.org/genproto v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250414145226-207652e42e2e // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e // indirect - google.golang.org/grpc v1.72.1 // indirect - google.golang.org/protobuf v1.36.8 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + google.golang.org/api v0.253.0 // indirect + google.golang.org/genproto v0.0.0-20250603155806-513f23925822 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250804133106-a7a43d27e69b // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f // indirect + google.golang.org/grpc v1.76.0 // indirect + google.golang.org/protobuf v1.36.10 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 
v0.9.1 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.34.1 // indirect k8s.io/apiserver v0.34.1 // indirect k8s.io/component-base v0.34.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect diff --git a/janitor/go.sum b/janitor/go.sum index cc0a0e454..8285e305f 100644 --- a/janitor/go.sum +++ b/janitor/go.sum @@ -1,15 +1,15 @@ cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= -cloud.google.com/go v0.120.0 h1:wc6bgG9DHyKqF5/vQvX1CiZrtHnxJjBlKUyF9nP6meA= -cloud.google.com/go v0.120.0/go.mod h1:/beW32s8/pGRuj4IILWQNd4uuebeT4dkOhKmkfit64Q= -cloud.google.com/go/auth v0.16.0 h1:Pd8P1s9WkcrBE2n/PhAwKsdrR35V3Sg2II9B+ndM3CU= -cloud.google.com/go/auth v0.16.0/go.mod h1:1howDHJ5IETh/LwYs3ZxvlkXF48aSqqJUM+5o02dNOI= +cloud.google.com/go v0.121.1 h1:S3kTQSydxmu1JfLRLpKtxRPA7rSrYPRPEUmL/PavVUw= +cloud.google.com/go v0.121.1/go.mod h1:nRFlrHq39MNVWu+zESP2PosMWA0ryJw8KUBZ2iZpxbw= +cloud.google.com/go/auth v0.17.0 h1:74yCm7hCj2rUyyAocqnFzsAYXgJhrG26XCFimrc/Kz4= +cloud.google.com/go/auth v0.17.0/go.mod h1:6wv/t5/6rOPAX4fJiRjKkJCvswLwdet7G8+UGXt7nCQ= cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= -cloud.google.com/go/compute v1.36.1 h1:uG7oFEbaElkxgzT0jpX9fQp/tdsR/P7+UL7D90Zj0Lc= -cloud.google.com/go/compute v1.36.1/go.mod h1:AsK4VqrSyXBo4SMbRtfAO1VfaMjUEjEwv1UB/AwVp5Q= -cloud.google.com/go/compute/metadata v0.6.0 h1:A6hENjEsCDtC1k8byVsgwvVcioamEHvZ4j01OwKxG9I= -cloud.google.com/go/compute/metadata v0.6.0/go.mod h1:FjyFAW1MW0C203CEOMDTu3Dk1FlqW3Rga40jzHL4hfg= +cloud.google.com/go/compute v1.38.0 h1:MilCLYQW2m7Dku8hRIIKo4r0oKastlD74sSu16riYKs= +cloud.google.com/go/compute v1.38.0/go.mod h1:oAFNIuXOmXbK/ssXm3z4nZB8ckPdjltJ7xhHCdbWFZM= +cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= +cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 h1:Gt0j3wceWMwPmiazCa8MzMA0MfhmPIz0Qp0FJ6qcM0U= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0/go.mod h1:Ot/6aikWnKWi4l9QB7qVSwa8iMphQNqkWALMoNT3rzM= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.9.0 h1:OVoM452qUFBrX+URdH3VpR299ma4kfom0yB0URYky9g= @@ -30,36 +30,38 @@ github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJ github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 h1:oygO0locgZJe7PpYPXT5A29ZkwJaPqcva7BVeemZOZs= github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI= +github.com/Masterminds/semver/v3 v3.4.0 
h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= -github.com/aws/aws-sdk-go-v2 v1.39.2 h1:EJLg8IdbzgeD7xgvZ+I8M1e0fL0ptn/M47lianzth0I= -github.com/aws/aws-sdk-go-v2 v1.39.2/go.mod h1:sDioUELIUO9Znk23YVmIk86/9DOpkbyyVb1i/gUNFXY= -github.com/aws/aws-sdk-go-v2/config v1.31.12 h1:pYM1Qgy0dKZLHX2cXslNacbcEFMkDMl+Bcj5ROuS6p8= -github.com/aws/aws-sdk-go-v2/config v1.31.12/go.mod h1:/MM0dyD7KSDPR+39p9ZNVKaHDLb9qnfDurvVS2KAhN8= -github.com/aws/aws-sdk-go-v2/credentials v1.18.16 h1:4JHirI4zp958zC026Sm+V4pSDwW4pwLefKrc0bF2lwI= -github.com/aws/aws-sdk-go-v2/credentials v1.18.16/go.mod h1:qQMtGx9OSw7ty1yLclzLxXCRbrkjWAM7JnObZjmCB7I= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.9 h1:Mv4Bc0mWmv6oDuSWTKnk+wgeqPL5DRFu5bQL9BGPQ8Y= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.9/go.mod h1:IKlKfRppK2a1y0gy1yH6zD+yX5uplJ6UuPlgd48dJiQ= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.9 h1:se2vOWGD3dWQUtfn4wEjRQJb1HK1XsNIt825gskZ970= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.9/go.mod h1:hijCGH2VfbZQxqCDN7bwz/4dzxV+hkyhjawAtdPWKZA= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.9 h1:6RBnKZLkJM4hQ+kN6E7yWFveOTg8NLPHAkqrs4ZPlTU= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.9/go.mod h1:V9rQKRmK7AWuEsOMnHzKj8WyrIir1yUJbZxDuZLFvXI= -github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo= -github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo= +github.com/aws/aws-sdk-go-v2 v1.39.4 h1:qTsQKcdQPHnfGYBBs+Btl8QwxJeoWcOcPcixK90mRhg= +github.com/aws/aws-sdk-go-v2 v1.39.4/go.mod h1:yWSxrnioGUZ4WVv9TgMrNUeLV3PFESn/v+6T/Su8gnM= +github.com/aws/aws-sdk-go-v2/config v1.31.15 h1:gE3M4xuNXfC/9bG4hyowGm/35uQTi7bUKeYs5e/6uvU= +github.com/aws/aws-sdk-go-v2/config v1.31.15/go.mod h1:HvnvGJoE2I95KAIW8kkWVPJ4XhdrlvwJpV6pEzFQa8o= +github.com/aws/aws-sdk-go-v2/credentials v1.18.19 h1:Jc1zzwkSY1QbkEcLujwqRTXOdvW8ppND3jRBb/VhBQc= +github.com/aws/aws-sdk-go-v2/credentials v1.18.19/go.mod h1:DIfQ9fAk5H0pGtnqfqkbSIzky82qYnGvh06ASQXXg6A= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.11 h1:X7X4YKb+c0rkI6d4uJ5tEMxXgCZ+jZ/D6mvkno8c8Uw= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.11/go.mod h1:EqM6vPZQsZHYvC4Cai35UDg/f5NCEU+vp0WfbVqVcZc= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.11 h1:7AANQZkF3ihM8fbdftpjhken0TP9sBzFbV/Ze/Y4HXA= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.11/go.mod h1:NTF4QCGkm6fzVwncpkFQqoquQyOolcyXfbpC98urj+c= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.11 h1:ShdtWUZT37LCAA4Mw2kJAJtzaszfSHFb5n25sdcv4YE= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.11/go.mod h1:7bUb2sSr2MZ3M/N+VyETLTQtInemHXb/Fl3s8CLzm0Y= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc= github.com/aws/aws-sdk-go-v2/service/ec2 v1.254.1 h1:7p9bJCZ/b3EJXXARW7JMEs2IhsnI4YFHpfXQfgMh0eg= github.com/aws/aws-sdk-go-v2/service/ec2 v1.254.1/go.mod h1:M8WWWIfXmxA4RgTXcI/5cSByxRqjgne32Sh0VIbrn0A= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 h1:oegbebPEMA/1Jny7kvwejowCaHz1FWZAQ94WXFNCyTM= 
-github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1/go.mod h1:kemo5Myr9ac0U9JfSjMo9yHLtw+pECEHsFtJ9tqCEI8= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.9 h1:5r34CgVOD4WZudeEKZ9/iKpiT6cM1JyEROpXjOcdWv8= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.9/go.mod h1:dB12CEbNWPbzO2uC6QSWHteqOg4JfBVJOojbAoAUb5I= -github.com/aws/aws-sdk-go-v2/service/sso v1.29.6 h1:A1oRkiSQOWstGh61y4Wc/yQ04sqrQZr1Si/oAXj20/s= -github.com/aws/aws-sdk-go-v2/service/sso v1.29.6/go.mod h1:5PfYspyCU5Vw1wNPsxi15LZovOnULudOQuVxphSflQA= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.1 h1:5fm5RTONng73/QA73LhCNR7UT9RpFH3hR6HWL6bIgVY= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.1/go.mod h1:xBEjWD13h+6nq+z4AkqSfSvqRKFgDIQeaMguAJndOWo= -github.com/aws/aws-sdk-go-v2/service/sts v1.38.6 h1:p3jIvqYwUZgu/XYeI48bJxOhvm47hZb5HUQ0tn6Q9kA= -github.com/aws/aws-sdk-go-v2/service/sts v1.38.6/go.mod h1:WtKK+ppze5yKPkZ0XwqIVWD4beCwv056ZbPQNoeHqM8= -github.com/aws/smithy-go v1.23.0 h1:8n6I3gXzWJB2DxBDnfxgBaSX6oe0d/t10qGz7OKqMCE= -github.com/aws/smithy-go v1.23.0/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.2 h1:xtuxji5CS0JknaXoACOunXOYOQzgfTvGAc9s2QdCJA4= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.2/go.mod h1:zxwi0DIR0rcRcgdbl7E2MSOvxDyyXGBlScvBkARFaLQ= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.11 h1:GpMf3z2KJa4RnJ0ew3Hac+hRFYLZ9DDjfgXjuW+pB54= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.11/go.mod h1:6MZP3ZI4QQsgUCFTwMZA2V0sEriNQ8k2hmoHF3qjimQ= +github.com/aws/aws-sdk-go-v2/service/sso v1.29.8 h1:M5nimZmugcZUO9wG7iVtROxPhiqyZX6ejS1lxlDPbTU= +github.com/aws/aws-sdk-go-v2/service/sso v1.29.8/go.mod h1:mbef/pgKhtKRwrigPPs7SSSKZgytzP8PQ6P6JAAdqyM= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.3 h1:S5GuJZpYxE0lKeMHKn+BRTz6PTFpgThyJ+5mYfux7BM= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.3/go.mod h1:X4OF+BTd7HIb3L+tc4UlWHVrpgwZZIVENU15pRDVTI0= +github.com/aws/aws-sdk-go-v2/service/sts v1.38.9 h1:Ekml5vGg6sHSZLZJQJagefnVe6PmqC2oiRkBq4F7fU0= +github.com/aws/aws-sdk-go-v2/service/sts v1.38.9/go.mod h1:/e15V+o1zFHWdH3u7lpI3rVBcxszktIKuHKCY2/py+k= +github.com/aws/smithy-go v1.23.1 h1:sLvcH6dfAFwGkHLZ7dGiYF7aK6mg4CgKA/iDKjLDt9M= +github.com/aws/smithy-go v1.23.1/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -69,14 +71,14 @@ github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyY github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew 
v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= -github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= -github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= @@ -89,6 +91,12 @@ github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.14 h1:3fAqdB6BCPKHDMHAKRwtPUwYexKtGrNuw8HX/T/4neo= +github.com/gkampitakis/go-snaps v0.5.14/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -96,18 +104,42 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= -github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= -github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= -github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= -github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= -github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= -github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= -github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-openapi/jsonpointer v0.22.1 h1:sHYI1He3b9NqJ4wXLoJDKmUmHkWy/L7rtEo92JUxBNk= +github.com/go-openapi/jsonpointer v0.22.1/go.mod h1:pQT9OsLkfz1yWoMgYFy4x3U5GY5nUlsOn1qSBH5MkCM= +github.com/go-openapi/jsonreference v0.21.2 h1:Wxjda4M/BBQllegefXrY/9aq1fxBA8sI5M/lFU6tSWU= +github.com/go-openapi/jsonreference 
v0.21.2/go.mod h1:pp3PEjIsJ9CZDGCNOyXIQxsNuroxm8FAJ/+quA0yKzQ= +github.com/go-openapi/swag v0.25.1 h1:6uwVsx+/OuvFVPqfQmOOPsqTcm5/GkBhNwLqIR916n8= +github.com/go-openapi/swag v0.25.1/go.mod h1:bzONdGlT0fkStgGPd3bhZf1MnuPkf2YAys6h+jZipOo= +github.com/go-openapi/swag/cmdutils v0.25.1 h1:nDke3nAFDArAa631aitksFGj2omusks88GF1VwdYqPY= +github.com/go-openapi/swag/cmdutils v0.25.1/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0= +github.com/go-openapi/swag/conv v0.25.1 h1:+9o8YUg6QuqqBM5X6rYL/p1dpWeZRhoIt9x7CCP+he0= +github.com/go-openapi/swag/conv v0.25.1/go.mod h1:Z1mFEGPfyIKPu0806khI3zF+/EUXde+fdeksUl2NiDs= +github.com/go-openapi/swag/fileutils v0.25.1 h1:rSRXapjQequt7kqalKXdcpIegIShhTPXx7yw0kek2uU= +github.com/go-openapi/swag/fileutils v0.25.1/go.mod h1:+NXtt5xNZZqmpIpjqcujqojGFek9/w55b3ecmOdtg8M= +github.com/go-openapi/swag/jsonname v0.25.1 h1:Sgx+qbwa4ej6AomWC6pEfXrA6uP2RkaNjA9BR8a1RJU= +github.com/go-openapi/swag/jsonname v0.25.1/go.mod h1:71Tekow6UOLBD3wS7XhdT98g5J5GR13NOTQ9/6Q11Zo= +github.com/go-openapi/swag/jsonutils v0.25.1 h1:AihLHaD0brrkJoMqEZOBNzTLnk81Kg9cWr+SPtxtgl8= +github.com/go-openapi/swag/jsonutils v0.25.1/go.mod h1:JpEkAjxQXpiaHmRO04N1zE4qbUEg3b7Udll7AMGTNOo= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.1 h1:DSQGcdB6G0N9c/KhtpYc71PzzGEIc/fZ1no35x4/XBY= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.1/go.mod h1:kjmweouyPwRUEYMSrbAidoLMGeJ5p6zdHi9BgZiqmsg= +github.com/go-openapi/swag/loading v0.25.1 h1:6OruqzjWoJyanZOim58iG2vj934TysYVptyaoXS24kw= +github.com/go-openapi/swag/loading v0.25.1/go.mod h1:xoIe2EG32NOYYbqxvXgPzne989bWvSNoWoyQVWEZicc= +github.com/go-openapi/swag/mangling v0.25.1 h1:XzILnLzhZPZNtmxKaz/2xIGPQsBsvmCjrJOWGNz/ync= +github.com/go-openapi/swag/mangling v0.25.1/go.mod h1:CdiMQ6pnfAgyQGSOIYnZkXvqhnnwOn997uXZMAd/7mQ= +github.com/go-openapi/swag/netutils v0.25.1 h1:2wFLYahe40tDUHfKT1GRC4rfa5T1B4GWZ+msEFA4Fl4= +github.com/go-openapi/swag/netutils v0.25.1/go.mod h1:CAkkvqnUJX8NV96tNhEQvKz8SQo2KF0f7LleiJwIeRE= +github.com/go-openapi/swag/stringutils v0.25.1 h1:Xasqgjvk30eUe8VKdmyzKtjkVjeiXx1Iz0zDfMNpPbw= +github.com/go-openapi/swag/stringutils v0.25.1/go.mod h1:JLdSAq5169HaiDUbTvArA2yQxmgn4D6h4A+4HqVvAYg= +github.com/go-openapi/swag/typeutils v0.25.1 h1:rD/9HsEQieewNt6/k+JBwkxuAHktFtH3I3ysiFZqukA= +github.com/go-openapi/swag/typeutils v0.25.1/go.mod h1:9McMC/oCdS4BKwk2shEB7x17P6HmMmA6dQRtAkSnNb8= +github.com/go-openapi/swag/yamlutils v0.25.1 h1:mry5ez8joJwzvMbaTGLhw8pXUnhDK91oSJLDPF1bmGk= +github.com/go-openapi/swag/yamlutils v0.25.1/go.mod h1:cm9ywbzncy3y6uPm/97ysW8+wZ09qsks+9RS8fLWKqg= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs= github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= +github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= +github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= github.com/gofrs/flock v0.10.0 h1:SHMXenfaB03KbroETaCMtbBg3Yn29v4w1r+tgy4ff4k= github.com/gofrs/flock v0.10.0/go.mod h1:FirDy1Ing0mI2+kB6wk+vyyAH+e6xiE+EYA0jnzV9jc= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= @@ -127,22 +159,22 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod 
h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d h1:KJIErDwbSHjnp/SGzE5ed8Aol7JsKiI5X7yWKAtzhM0= +github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0= github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU9uHLo7OnF5tL52HFAgMmyrf4= github.com/googleapis/enterprise-certificate-proxy v0.3.6/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA= -github.com/googleapis/gax-go/v2 v2.14.1 h1:hb0FFeiPaQskmvakKu5EbCbpntQn48jyHuvrkurSS/Q= -github.com/googleapis/gax-go/v2 v2.14.1/go.mod h1:Hb/NubMaVM88SrNkvl8X/o8XWwDJEPqouaLeN2IUxoA= +github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81vgd/bo= +github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= -github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= @@ -151,17 +183,16 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 
h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= -github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -170,10 +201,10 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= -github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw= -github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/onsi/ginkgo/v2 v2.26.0 h1:1J4Wut1IlYZNEAWIV3ALrT9NfiaGW2cDCJQSFQMs/gE= +github.com/onsi/ginkgo/v2 v2.26.0/go.mod h1:qhEywmzWTBUY88kfO0BRvX4py7scov9yR+Az2oavUzw= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/oracle/oci-go-sdk/v65 v65.102.1 h1:zLNLz5dVzZxOf5DK/f3WGZUjwrQ9m27fd4abOFwQRCQ= github.com/oracle/oci-go-sdk/v65 v65.102.1/go.mod h1:oB8jFGVc/7/zJ+DbleE8MzGHjhs2ioCz5stRTdZdIcY= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= @@ -182,16 +213,19 @@ github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmd github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= +github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod 
h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= -github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/prometheus/common v0.67.1 h1:OTSON1P4DNxzTg4hmKCc37o4ZAZDv0cfXLkOt0oEowI= +github.com/prometheus/common v0.67.1/go.mod h1:RpmT9v35q2Y+lsieQsdOh5sXZ6ajUGC8NjZAmr8vb0Q= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM= github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= @@ -229,59 +263,71 @@ github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= -go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod 
h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 h1:OeNbIYk/2C15ckl7glBlOBp5+WlYsOElzTNmiPW/x60= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0/go.mod h1:7Bept48yIeqxP2OZ9/AqIpYS94h2or0aB4FypJTc8ZM= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 h1:tgJ0uaNS4c98WRNUEx5U3aDlrDOI5Rs+1Vifcw4DJ8U= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0/go.mod h1:U7HYyW0zt/a9x5J1Kjs+r1f/d4ZHnYFclhYY2+YbeoE= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= -go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= -go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= -go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o= -go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4= go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp0eVQ4yIXvJ4= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= 
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= -golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= +golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= -golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= -golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= -golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= +golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= +golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY= +golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -291,45 +337,47 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= -golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= -golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod 
h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q= +golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= -golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= -golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= -golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= +golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= +golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= +golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= -golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/api v0.229.0 h1:p98ymMtqeJ5i3lIBMj5MpR9kzIIgzpHHh8vQ+vgAzx8= -google.golang.org/api v0.229.0/go.mod h1:wyDfmq5g1wYJWn29O22FDWN48P7Xcz0xz+LBpptYvB0= -google.golang.org/genproto v0.0.0-20250303144028-a0af3efb3deb h1:ITgPrl429bc6+2ZraNSzMDk3I95nmQln2fuPstKwFDE= -google.golang.org/genproto v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:sAo5UzpjUwgFBCzupwhcLcxHVDK7vG5IqI30YnwX2eE= -google.golang.org/genproto/googleapis/api v0.0.0-20250414145226-207652e42e2e h1:UdXH7Kzbj+Vzastr5nVfccbmFsmYNygVLSPk1pEfDoY= -google.golang.org/genproto/googleapis/api v0.0.0-20250414145226-207652e42e2e/go.mod h1:085qFyf2+XaZlRdCgKNCIZ3afY2p4HHZdoIRpId8F4A= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e h1:ztQaXfzEXTmCBvbtWYRhJxW+0iJcz2qXfd38/e9l7bA= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA= -google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= -google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= -google.golang.org/protobuf 
v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/api v0.253.0 h1:apU86Eq9Q2eQco3NsUYFpVTfy7DwemojL7LmbAj7g/I= +google.golang.org/api v0.253.0/go.mod h1:PX09ad0r/4du83vZVAaGg7OaeyGnaUmT/CYPNvtLCbw= +google.golang.org/genproto v0.0.0-20250603155806-513f23925822 h1:rHWScKit0gvAPuOnu87KpaYtjK5zBMLcULh7gxkCXu4= +google.golang.org/genproto v0.0.0-20250603155806-513f23925822/go.mod h1:HubltRL7rMh0LfnQPkMH4NPDFEWp0jw3vixw7jEM53s= +google.golang.org/genproto/googleapis/api v0.0.0-20250804133106-a7a43d27e69b h1:ULiyYQ0FdsJhwwZUwbaXpZF5yUE3h+RA+gxvBu37ucc= +google.golang.org/genproto/googleapis/api v0.0.0-20250804133106-a7a43d27e69b/go.mod h1:oDOGiMSXHL4sDTJvFvIB9nRQCGdLP1o/iVaqQK8zB+M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f h1:1FTH6cpXFsENbPR5Bu8NQddPSaUUE6NA2XdZdDSAJK4= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A= +google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -349,16 +397,16 @@ k8s.io/component-base v0.34.1 h1:v7xFgG+ONhytZNFpIz5/kecwD+sUhVE6HU7qQUiRM4A= k8s.io/component-base v0.34.1/go.mod h1:mknCpLlTSKHzAQJJnnHVKqjxR7gBeHRv0rPXA7gdtQ0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= -k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod 
h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/controller-runtime v0.22.3 h1:I7mfqz/a/WdmDCEnXmSPm8/b/yRTy6JsKKENTijTq8Y= sigs.k8s.io/controller-runtime v0.22.3/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= diff --git a/node-drainer-module/go.mod b/node-drainer-module/go.mod index 5e2450271..2269cc3ae 100644 --- a/node-drainer-module/go.mod +++ b/node-drainer-module/go.mod @@ -47,6 +47,7 @@ require ( github.com/golang/snappy v1.0.0 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect + github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d // indirect github.com/google/uuid v1.6.0 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect @@ -55,6 +56,8 @@ require ( github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/montanaflynn/stats v0.7.1 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/onsi/ginkgo/v2 v2.26.0 // indirect + github.com/onsi/gomega v1.38.2 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.1 // indirect @@ -65,6 +68,8 @@ require ( github.com/xdg-go/scram v1.1.2 // indirect github.com/xdg-go/stringprep v1.0.4 // indirect github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.43.0 // indirect @@ -74,6 +79,7 @@ require ( golang.org/x/term v0.36.0 // indirect golang.org/x/text v0.30.0 // indirect golang.org/x/time v0.14.0 // indirect + golang.org/x/tools v0.38.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f // indirect google.golang.org/grpc v1.76.0 // indirect google.golang.org/protobuf v1.36.10 // indirect diff --git a/node-drainer-module/go.sum b/node-drainer-module/go.sum index 8fffec31d..548c83e35 100644 --- a/node-drainer-module/go.sum +++ b/node-drainer-module/go.sum @@ -1,5 +1,7 @@ github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg= github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/beorn7/perks v1.0.1 
h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= @@ -65,8 +67,8 @@ github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7O github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d h1:KJIErDwbSHjnp/SGzE5ed8Aol7JsKiI5X7yWKAtzhM0= +github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -96,10 +98,10 @@ github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8 github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= -github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw= -github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/onsi/ginkgo/v2 v2.26.0 h1:1J4Wut1IlYZNEAWIV3ALrT9NfiaGW2cDCJQSFQMs/gE= +github.com/onsi/ginkgo/v2 v2.26.0/go.mod h1:qhEywmzWTBUY88kfO0BRvX4py7scov9yR+Az2oavUzw= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -140,14 +142,16 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= go.opentelemetry.io/otel/sdk 
v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -167,6 +171,8 @@ golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -209,8 +215,8 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE= -golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/store-client-sdk/pkg/storewatcher/watch_store_mock.go b/store-client-sdk/pkg/storewatcher/watch_store_mock.go new file mode 100644 index 000000000..c421a6a4e --- /dev/null +++ b/store-client-sdk/pkg/storewatcher/watch_store_mock.go @@ -0,0 +1,173 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package storewatcher + +import ( + "context" + "sync" + + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/bson/primitive" +) + +// FakeChangeStreamWatcher provides a fake implementation of the ChangeStreamWatcher +// for testing purposes. It allows customization of behavior through function fields. +type FakeChangeStreamWatcher struct { + // EventsChan is the buffered channel (buffer size 10) that Events() returns + EventsChan chan bson.M + + // Function fields allow customization of mock behavior + StartFn func(ctx context.Context) + MarkProcessedFn func(ctx context.Context) error + CloseFn func(ctx context.Context) error + GetUnprocessedEventCountFn func(ctx context.Context, lastProcessedID primitive.ObjectID, + additionalFilters ...bson.M) (int64, error) + + // Call tracking fields + StartCalled int + MarkProcessedCalled int + CloseCalled int + GetUnprocessedEventCountCalled int + + // Parameter tracking for verification in tests + LastMarkProcessedCtx context.Context + LastCloseCtx context.Context + LastGetUnprocessedEventCountCtx context.Context + LastGetUnprocessedEventCountID primitive.ObjectID + LastGetUnprocessedEventCountFilters []bson.M + + mu sync.Mutex +} + +// NewFakeChangeStreamWatcher creates a new FakeChangeStreamWatcher with default behavior. +// The default behavior is safe and suitable for most tests. +func NewFakeChangeStreamWatcher() *FakeChangeStreamWatcher { + return &FakeChangeStreamWatcher{ + EventsChan: make(chan bson.M, 10), + StartFn: func(ctx context.Context) { + // Default: no-op + }, + MarkProcessedFn: func(ctx context.Context) error { + // Default: succeed + return nil + }, + CloseFn: func(ctx context.Context) error { + // Default: succeed + return nil + }, + GetUnprocessedEventCountFn: func(ctx context.Context, lastProcessedID primitive.ObjectID, + additionalFilters ...bson.M) (int64, error) { + // Default: return 0 events + return 0, nil + }, + } +} + +// Events returns the read-only events channel. +func (m *FakeChangeStreamWatcher) Events() <-chan bson.M { + return m.EventsChan +} + +// Start executes the configured Start function and tracks the call. +func (m *FakeChangeStreamWatcher) Start(ctx context.Context) { + m.mu.Lock() + defer m.mu.Unlock() + + m.StartCalled++ + if m.StartFn != nil { + m.StartFn(ctx) + } +} + +// MarkProcessed executes the configured MarkProcessed function and tracks the call. +func (m *FakeChangeStreamWatcher) MarkProcessed(ctx context.Context) error { + m.mu.Lock() + defer m.mu.Unlock() + + m.MarkProcessedCalled++ + m.LastMarkProcessedCtx = ctx + + if m.MarkProcessedFn != nil { + return m.MarkProcessedFn(ctx) + } + + return nil +} + +// Close executes the configured Close function and tracks the call. +func (m *FakeChangeStreamWatcher) Close(ctx context.Context) error { + m.mu.Lock() + defer m.mu.Unlock() + + m.CloseCalled++ + m.LastCloseCtx = ctx + + if m.CloseFn != nil { + return m.CloseFn(ctx) + } + + return nil +} + +// GetUnprocessedEventCount executes the configured GetUnprocessedEventCount function and tracks the call. 
+func (m *FakeChangeStreamWatcher) GetUnprocessedEventCount( + ctx context.Context, + lastProcessedID primitive.ObjectID, + additionalFilters ...bson.M, +) (int64, error) { + m.mu.Lock() + defer m.mu.Unlock() + + m.GetUnprocessedEventCountCalled++ + m.LastGetUnprocessedEventCountCtx = ctx + m.LastGetUnprocessedEventCountID = lastProcessedID + m.LastGetUnprocessedEventCountFilters = additionalFilters + + if m.GetUnprocessedEventCountFn != nil { + return m.GetUnprocessedEventCountFn(ctx, lastProcessedID, additionalFilters...) + } + + return 0, nil +} + +// Reset clears all call counters, tracked parameters, and drains any pending events from EventsChan. +// This is useful when reusing the same fake across multiple test cases. +func (m *FakeChangeStreamWatcher) Reset() { + m.mu.Lock() + defer m.mu.Unlock() + + m.StartCalled = 0 + m.MarkProcessedCalled = 0 + m.CloseCalled = 0 + m.GetUnprocessedEventCountCalled = 0 + + m.LastMarkProcessedCtx = nil + m.LastCloseCtx = nil + m.LastGetUnprocessedEventCountCtx = nil + m.LastGetUnprocessedEventCountID = primitive.ObjectID{} + m.LastGetUnprocessedEventCountFilters = nil + + for len(m.EventsChan) > 0 { + <-m.EventsChan + } +} + +// GetCallCounts returns the current call counts for all methods in a thread-safe manner. +func (m *FakeChangeStreamWatcher) GetCallCounts() (start, markProcessed, close, getUnprocessed int) { + m.mu.Lock() + defer m.mu.Unlock() + + return m.StartCalled, m.MarkProcessedCalled, m.CloseCalled, m.GetUnprocessedEventCountCalled +}
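Note on the new FakeChangeStreamWatcher added in store-client-sdk/pkg/storewatcher/watch_store_mock.go: the fake is driven entirely through its exported function fields (StartFn, MarkProcessedFn, CloseFn, GetUnprocessedEventCountFn), the buffered EventsChan, and the call-tracking counters. The sketch below is not part of this change; it is a minimal in-package test illustrating the intended wiring, and the file name, event payload, and simulated error are made-up fixtures.

// watch_store_mock_sketch_test.go — hypothetical usage sketch for FakeChangeStreamWatcher.
// Only exercises the fake itself; the event payload and simulated error are test fixtures.
package storewatcher

import (
	"context"
	"errors"
	"testing"

	"go.mongodb.org/mongo-driver/bson"
)

func TestFakeChangeStreamWatcherSketch(t *testing.T) {
	fake := NewFakeChangeStreamWatcher()

	// Override one function field to simulate a failure path on Close.
	wantErr := errors.New("simulated close failure")
	fake.CloseFn = func(ctx context.Context) error { return wantErr }

	// Feed a fabricated change-stream event through the buffered channel and read it back.
	fake.EventsChan <- bson.M{"operationType": "insert"}
	if evt := <-fake.Events(); evt["operationType"] != "insert" {
		t.Fatalf("unexpected event: %v", evt)
	}

	ctx := context.Background()
	fake.Start(ctx)

	// Default MarkProcessedFn succeeds.
	if err := fake.MarkProcessed(ctx); err != nil {
		t.Fatalf("MarkProcessed: %v", err)
	}

	// The overridden CloseFn surfaces the simulated error.
	if err := fake.Close(ctx); !errors.Is(err, wantErr) {
		t.Fatalf("Close: want simulated error, got %v", err)
	}

	// Call counters let tests assert how the code under test drove the watcher.
	start, marked, closed, _ := fake.GetCallCounts()
	if start != 1 || marked != 1 || closed != 1 {
		t.Fatalf("unexpected call counts: start=%d marked=%d closed=%d", start, marked, closed)
	}
}

Because the fake lives in the same package as the real ChangeStreamWatcher, tests elsewhere in the repository would construct it with NewFakeChangeStreamWatcher() and pass it wherever the watcher interface is accepted; Reset() lets a single instance be reused across table-driven cases.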