diff --git a/api/config/v1/config.go b/api/config/v1/config.go index 5df5284cf..5aac1eade 100644 --- a/api/config/v1/config.go +++ b/api/config/v1/config.go @@ -37,6 +37,7 @@ type Config struct { Resources Resources `json:"resources,omitempty" yaml:"resources,omitempty"` Sharing Sharing `json:"sharing,omitempty" yaml:"sharing,omitempty"` Imex Imex `json:"imex,omitempty" yaml:"imex,omitempty"` + Health *Health `json:"health,omitempty" yaml:"health,omitempty"` } // NewConfig builds out a Config struct from a config file (or command line flags). @@ -77,6 +78,16 @@ func NewConfig(c *cli.Context, flags []cli.Flag) (*Config, error) { config.Sharing.MPS.FailRequestsGreaterThanOne = true } + // Initialize health configuration with defaults if not specified + if config.Health == nil { + config.Health = DefaultHealth() + } + + // Validate health configuration + if err := config.Health.Validate(); err != nil { + return nil, fmt.Errorf("invalid health configuration: %v", err) + } + return config, nil } diff --git a/api/config/v1/health.go b/api/config/v1/health.go new file mode 100644 index 000000000..a39fd4582 --- /dev/null +++ b/api/config/v1/health.go @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1 + +import ( + "encoding/json" + "fmt" + "strings" +) + +// Health defines the configuration for device health checks +type Health struct { + // Disabled indicates whether health checks are disabled entirely + Disabled bool `json:"disabled,omitempty" yaml:"disabled,omitempty"` + // EventTypes specifies which NVML event types to monitor + EventTypes []string `json:"eventTypes,omitempty" yaml:"eventTypes,omitempty"` + // IgnoredXIDs lists XIDs that should be ignored (non-fatal) + IgnoredXIDs []uint64 `json:"ignoredXIDs,omitempty" yaml:"ignoredXIDs,omitempty"` + // CriticalXIDs specifies which XIDs are considered critical + // Can be "all" or a list of specific XIDs + CriticalXIDs *CriticalXIDsType `json:"criticalXIDs,omitempty" yaml:"criticalXIDs,omitempty"` +} + +// CriticalXIDsType represents either "all" XIDs or a specific list +type CriticalXIDsType struct { + // All indicates if all XIDs should be considered critical + All bool + // XIDs contains specific XIDs to treat as critical + XIDs []uint64 +} + +// UnmarshalJSON implements custom JSON unmarshaling for CriticalXIDsType +func (c *CriticalXIDsType) UnmarshalJSON(data []byte) error { + // Try to unmarshal as string first + var str string + if err := json.Unmarshal(data, &str); err == nil { + if strings.ToLower(str) == "all" { + c.All = true + c.XIDs = nil + return nil + } + return fmt.Errorf("invalid string value for criticalXIDs: %s", str) + } + + // Try to unmarshal as array of numbers + var xids []uint64 + if err := json.Unmarshal(data, &xids); err == nil { + c.All = false + c.XIDs = xids + return nil + } + + return fmt.Errorf("criticalXIDs must be either \"all\" or an array of numbers") +} + +// MarshalJSON implements custom JSON marshaling for CriticalXIDsType +func (c CriticalXIDsType) MarshalJSON() ([]byte, error) { + if c.All { + return json.Marshal("all") + } + return json.Marshal(c.XIDs) +} + +// UnmarshalYAML implements custom YAML unmarshaling for CriticalXIDsType +func (c *CriticalXIDsType) UnmarshalYAML(unmarshal func(interface{}) error) error { + // Try to unmarshal as string first + var str string + if err := unmarshal(&str); err == nil { + if strings.ToLower(str) == "all" { + c.All = true + c.XIDs = nil + return nil + } + return fmt.Errorf("invalid string value for criticalXIDs: %s", str) + } + + // Try to unmarshal as array of numbers + var xids []uint64 + if err := unmarshal(&xids); err == nil { + c.All = false + c.XIDs = xids + return nil + } + + return fmt.Errorf("criticalXIDs must be either \"all\" or an array of numbers") +} + +// MarshalYAML implements custom YAML marshaling for CriticalXIDsType +func (c CriticalXIDsType) MarshalYAML() (interface{}, error) { + if c.All { + return "all", nil + } + return c.XIDs, nil +} + +// DefaultHealth returns the default health configuration for standard deployments +func DefaultHealth() *Health { + return &Health{ + Disabled: false, + EventTypes: []string{ + "EventTypeXidCriticalError", + "EventTypeDoubleBitEccError", + "EventTypeSingleBitEccError", + }, + IgnoredXIDs: []uint64{ + 13, // Graphics Engine Exception + 31, // GPU memory page fault + 43, // GPU stopped processing + 45, // Preemptive cleanup, due to previous errors + 68, // Video processor exception + 109, // Context Switch Timeout Error + }, + CriticalXIDs: &CriticalXIDsType{ + All: true, + }, + } +} + +// IsCritical checks if a given XID should be treated as critical +func (h *Health) IsCritical(xid uint64) bool { + // If health checks are disabled, nothing is critical + if h.Disabled { + return false + } + + // Check if XID is in ignored list + for _, ignoredXID := range h.IgnoredXIDs { + if xid == ignoredXID { + return false + } + } + + // If no critical XIDs specified, default to all + if h.CriticalXIDs == nil { + return true + } + + // If all XIDs are critical (except ignored ones) + if h.CriticalXIDs.All { + return true + } + + // Check if XID is in critical list + for _, criticalXID := range h.CriticalXIDs.XIDs { + if xid == criticalXID { + return true + } + } + + return false +} + +// Validate checks if the health configuration is valid +func (h *Health) Validate() error { + if h == nil { + return nil + } + + // Validate event types + validEventTypes := map[string]bool{ + "EventTypeXidCriticalError": true, + "EventTypeDoubleBitEccError": true, + "EventTypeSingleBitEccError": true, + } + + for _, eventType := range h.EventTypes { + if !validEventTypes[eventType] { + return fmt.Errorf("invalid event type: %s", eventType) + } + } + + // Check for XID conflicts + if h.CriticalXIDs != nil && !h.CriticalXIDs.All && len(h.CriticalXIDs.XIDs) > 0 { + ignoredMap := make(map[uint64]bool) + for _, xid := range h.IgnoredXIDs { + ignoredMap[xid] = true + } + + for _, xid := range h.CriticalXIDs.XIDs { + if ignoredMap[xid] { + return fmt.Errorf("XID %d is in both ignored and critical lists", xid) + } + } + } + + return nil +} + +// Merge applies values from another Health config, overriding the current values with +// values from 'other'. This includes the Disabled field - if 'other' explicitly sets +// Disabled to false, it will enable health checks even if they were previously disabled. +// This is intentional behavior for configuration merging where 'other' represents a +// higher-priority configuration source. +// +// For fields that are slices (EventTypes, IgnoredXIDs) or pointers (CriticalXIDs), +// only non-empty/non-nil values from 'other' will override the current values. +func (h *Health) Merge(other *Health) { + if other == nil { + return + } + + h.Disabled = other.Disabled + + if len(other.EventTypes) > 0 { + h.EventTypes = other.EventTypes + } + + if len(other.IgnoredXIDs) > 0 { + h.IgnoredXIDs = other.IgnoredXIDs + } + + if other.CriticalXIDs != nil { + h.CriticalXIDs = other.CriticalXIDs + } +} diff --git a/api/config/v1/health_test.go b/api/config/v1/health_test.go new file mode 100644 index 000000000..933c08041 --- /dev/null +++ b/api/config/v1/health_test.go @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1 + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/require" + "sigs.k8s.io/yaml" +) + +func TestCriticalXIDsType_UnmarshalJSON(t *testing.T) { + testCases := []struct { + name string + input string + expected CriticalXIDsType + hasError bool + }{ + { + name: "all string", + input: `"all"`, + expected: CriticalXIDsType{ + All: true, + XIDs: nil, + }, + }, + { + name: "ALL string (case insensitive)", + input: `"ALL"`, + expected: CriticalXIDsType{ + All: true, + XIDs: nil, + }, + }, + { + name: "array of XIDs", + input: `[48, 79, 94]`, + expected: CriticalXIDsType{ + All: false, + XIDs: []uint64{48, 79, 94}, + }, + }, + { + name: "invalid string", + input: `"invalid"`, + hasError: true, + }, + { + name: "empty array", + input: `[]`, + expected: CriticalXIDsType{ + All: false, + XIDs: []uint64{}, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var c CriticalXIDsType + err := json.Unmarshal([]byte(tc.input), &c) + + if tc.hasError { + require.Error(t, err) + } else { + require.NoError(t, err) + require.Equal(t, tc.expected, c) + } + }) + } +} + +func TestCriticalXIDsType_MarshalJSON(t *testing.T) { + testCases := []struct { + name string + input CriticalXIDsType + expected string + }{ + { + name: "all XIDs", + input: CriticalXIDsType{ + All: true, + }, + expected: `"all"`, + }, + { + name: "specific XIDs", + input: CriticalXIDsType{ + All: false, + XIDs: []uint64{48, 79}, + }, + expected: `[48,79]`, + }, + { + name: "empty XIDs", + input: CriticalXIDsType{ + All: false, + XIDs: []uint64{}, + }, + expected: `[]`, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + data, err := json.Marshal(tc.input) + require.NoError(t, err) + require.JSONEq(t, tc.expected, string(data)) + }) + } +} + +func TestCriticalXIDsType_YAML(t *testing.T) { + testCases := []struct { + name string + input string + expected CriticalXIDsType + }{ + { + name: "all string", + input: "all", + expected: CriticalXIDsType{ + All: true, + XIDs: nil, + }, + }, + { + name: "array of XIDs", + input: `- 48 +- 79 +- 94`, + expected: CriticalXIDsType{ + All: false, + XIDs: []uint64{48, 79, 94}, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var c CriticalXIDsType + err := yaml.Unmarshal([]byte(tc.input), &c) + require.NoError(t, err) + require.Equal(t, tc.expected, c) + + // Test marshaling back + data, err := yaml.Marshal(c) + require.NoError(t, err) + + // Unmarshal again to verify roundtrip + var c2 CriticalXIDsType + err = yaml.Unmarshal(data, &c2) + require.NoError(t, err) + require.Equal(t, c, c2) + }) + } +} + +func TestHealth_IsCritical(t *testing.T) { + testCases := []struct { + name string + health Health + xid uint64 + expected bool + }{ + { + name: "disabled health checks", + health: Health{ + Disabled: true, + }, + xid: 48, + expected: false, + }, + { + name: "ignored XID", + health: Health{ + IgnoredXIDs: []uint64{13, 31, 43}, + CriticalXIDs: &CriticalXIDsType{ + All: true, + }, + }, + xid: 31, + expected: false, + }, + { + name: "all XIDs critical, not in ignored", + health: Health{ + IgnoredXIDs: []uint64{13, 31}, + CriticalXIDs: &CriticalXIDsType{ + All: true, + }, + }, + xid: 48, + expected: true, + }, + { + name: "specific critical XID", + health: Health{ + CriticalXIDs: &CriticalXIDsType{ + All: false, + XIDs: []uint64{48, 79}, + }, + }, + xid: 48, + expected: true, + }, + { + name: "not in critical list", + health: Health{ + CriticalXIDs: &CriticalXIDsType{ + All: false, + XIDs: []uint64{48, 79}, + }, + }, + xid: 94, + expected: false, + }, + { + name: "nil critical XIDs defaults to all", + health: Health{ + CriticalXIDs: nil, + }, + xid: 94, + expected: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result := tc.health.IsCritical(tc.xid) + require.Equal(t, tc.expected, result) + }) + } +} + +func TestHealth_Validate(t *testing.T) { + testCases := []struct { + name string + health *Health + hasError bool + errorMsg string + }{ + { + name: "nil health", + health: nil, + hasError: false, + }, + { + name: "valid configuration", + health: &Health{ + EventTypes: []string{"EventTypeXidCriticalError", "EventTypeDoubleBitEccError"}, + IgnoredXIDs: []uint64{13, 31}, + CriticalXIDs: &CriticalXIDsType{ + All: true, + }, + }, + hasError: false, + }, + { + name: "invalid event type", + health: &Health{ + EventTypes: []string{"InvalidEventType"}, + }, + hasError: true, + errorMsg: "invalid event type: InvalidEventType", + }, + { + name: "XID in both ignored and critical", + health: &Health{ + IgnoredXIDs: []uint64{48, 79}, + CriticalXIDs: &CriticalXIDsType{ + All: false, + XIDs: []uint64{48, 94}, + }, + }, + hasError: true, + errorMsg: "XID 48 is in both ignored and critical lists", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + err := tc.health.Validate() + if tc.hasError { + require.Error(t, err) + if tc.errorMsg != "" { + require.Contains(t, err.Error(), tc.errorMsg) + } + } else { + require.NoError(t, err) + } + }) + } +} + +func TestDefaultHealth(t *testing.T) { + h := DefaultHealth() + require.NotNil(t, h) + + require.False(t, h.Disabled) + require.Contains(t, h.EventTypes, "EventTypeXidCriticalError") + require.Contains(t, h.EventTypes, "EventTypeDoubleBitEccError") + require.Contains(t, h.EventTypes, "EventTypeSingleBitEccError") + + require.Contains(t, h.IgnoredXIDs, uint64(13)) + require.Contains(t, h.IgnoredXIDs, uint64(31)) + require.Contains(t, h.IgnoredXIDs, uint64(43)) + require.Contains(t, h.IgnoredXIDs, uint64(45)) + require.Contains(t, h.IgnoredXIDs, uint64(68)) + require.Contains(t, h.IgnoredXIDs, uint64(109)) + + require.NotNil(t, h.CriticalXIDs) + require.True(t, h.CriticalXIDs.All) +} + +func TestHealth_Merge(t *testing.T) { + // Note: The Merge function intentionally overwrites the Disabled field + // even when set to false. This is by design - when merging configurations, + // the 'other' config represents a higher-priority source that should + // override the base configuration. If we need to distinguish between + // "unset" and "explicitly false", we would need to change Disabled + // from bool to *bool, which would be a breaking API change. + testCases := []struct { + name string + base *Health + other *Health + expected *Health + }{ + { + name: "nil other", + base: DefaultHealth(), + other: nil, + expected: DefaultHealth(), + }, + { + name: "override disabled false to true", + base: &Health{ + Disabled: false, + }, + other: &Health{ + Disabled: true, + }, + expected: &Health{ + Disabled: true, + }, + }, + { + name: "override disabled true to false", + base: &Health{ + Disabled: true, + }, + other: &Health{ + Disabled: false, + }, + expected: &Health{ + Disabled: false, + }, + }, + { + name: "override event types", + base: &Health{ + EventTypes: []string{"EventTypeXidCriticalError"}, + }, + other: &Health{ + EventTypes: []string{"EventTypeDoubleBitEccError"}, + }, + expected: &Health{ + EventTypes: []string{"EventTypeDoubleBitEccError"}, + }, + }, + { + name: "override ignored XIDs", + base: &Health{ + IgnoredXIDs: []uint64{13, 31}, + }, + other: &Health{ + IgnoredXIDs: []uint64{43, 45}, + }, + expected: &Health{ + IgnoredXIDs: []uint64{43, 45}, + }, + }, + { + name: "override critical XIDs", + base: &Health{ + CriticalXIDs: &CriticalXIDsType{ + All: true, + }, + }, + other: &Health{ + CriticalXIDs: &CriticalXIDsType{ + All: false, + XIDs: []uint64{48}, + }, + }, + expected: &Health{ + CriticalXIDs: &CriticalXIDsType{ + All: false, + XIDs: []uint64{48}, + }, + }, + }, + { + name: "merge minimal config with disabled=false overwrites base", + base: &Health{ + Disabled: true, + EventTypes: []string{"EventTypeXidCriticalError"}, + IgnoredXIDs: []uint64{13, 31}, + }, + other: &Health{ + Disabled: false, // Explicitly enabling health checks + }, + expected: &Health{ + Disabled: false, // This is intentional - explicit false overrides + EventTypes: []string{"EventTypeXidCriticalError"}, // Unchanged + IgnoredXIDs: []uint64{13, 31}, // Unchanged + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + tc.base.Merge(tc.other) + require.Equal(t, tc.expected, tc.base) + }) + } +} + +func TestHealthConfigSerialization(t *testing.T) { + // Test that health config can be properly marshaled/unmarshaled + health := &Health{ + Disabled: false, + EventTypes: []string{"EventTypeXidCriticalError", "EventTypeDoubleBitEccError"}, + IgnoredXIDs: []uint64{13, 31, 43}, + CriticalXIDs: &CriticalXIDsType{ + All: true, + }, + } + + // Test JSON marshaling + jsonData, err := json.Marshal(health) + require.NoError(t, err) + + var healthFromJSON Health + err = json.Unmarshal(jsonData, &healthFromJSON) + require.NoError(t, err) + require.Equal(t, health, &healthFromJSON) + + // Test YAML marshaling + yamlData, err := yaml.Marshal(health) + require.NoError(t, err) + + var healthFromYAML Health + err = yaml.Unmarshal(yamlData, &healthFromYAML) + require.NoError(t, err) + require.Equal(t, health, &healthFromYAML) +} + +func TestHealthConfigExample(t *testing.T) { + // Test parsing a YAML configuration similar to what would be in a config file + yamlConfig := ` +disabled: false +eventTypes: + - EventTypeXidCriticalError + - EventTypeDoubleBitEccError +ignoredXIDs: + - 13 + - 31 + - 43 +criticalXIDs: all +` + var health Health + err := yaml.Unmarshal([]byte(yamlConfig), &health) + require.NoError(t, err) + + require.False(t, health.Disabled) + require.Len(t, health.EventTypes, 2) + require.Len(t, health.IgnoredXIDs, 3) + require.NotNil(t, health.CriticalXIDs) + require.True(t, health.CriticalXIDs.All) + + // Test GKE-style config + gkeYamlConfig := ` +eventTypes: + - EventTypeXidCriticalError +criticalXIDs: [48] +` + var gkeHealth Health + err = yaml.Unmarshal([]byte(gkeYamlConfig), &gkeHealth) + require.NoError(t, err) + + require.False(t, gkeHealth.Disabled) + require.Len(t, gkeHealth.EventTypes, 1) + require.Empty(t, gkeHealth.IgnoredXIDs) + require.NotNil(t, gkeHealth.CriticalXIDs) + require.False(t, gkeHealth.CriticalXIDs.All) + require.Equal(t, []uint64{48}, gkeHealth.CriticalXIDs.XIDs) +} diff --git a/internal/rm/health.go b/internal/rm/health.go index a1989cd88..223eb8a71 100644 --- a/internal/rm/health.go +++ b/internal/rm/health.go @@ -24,6 +24,8 @@ import ( "github.com/NVIDIA/go-nvml/pkg/nvml" "k8s.io/klog/v2" + + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" ) const ( @@ -32,16 +34,15 @@ const ( // disabled entirely. If set, the envvar is treated as a comma-separated list of Xids to ignore. Note that // this is in addition to the Application errors that are already ignored. envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS" - allHealthChecks = "xids" ) // CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error { - disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks)) - if disableHealthChecks == "all" { - disableHealthChecks = allHealthChecks - } - if strings.Contains(disableHealthChecks, "xids") { + // Apply backward compatibility with environment variable if health config is not set + healthConfig := r.getHealthConfig() + + if healthConfig.Disabled { + klog.Info("Health checks are disabled via configuration") return nil } @@ -59,25 +60,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic } }() - // FIXME: formalize the full list and document it. - // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 - // Application errors: the GPU should still be healthy - ignoredXids := []uint64{ - 13, // Graphics Engine Exception - 31, // GPU memory page fault - 43, // GPU stopped processing - 45, // Preemptive cleanup, due to previous errors - 68, // Video processor exception - 109, // Context Switch Timeout Error - } - - skippedXids := make(map[uint64]bool) - for _, id := range ignoredXids { - skippedXids[id] = true - } - - for _, additionalXid := range getAdditionalXids(disableHealthChecks) { - skippedXids[additionalXid] = true + // Build the event mask from configuration + eventMask, err := buildEventMask(healthConfig.EventTypes) + if err != nil { + return fmt.Errorf("failed to build event mask: %v", err) } eventSet, ret := r.nvml.EventSetCreate() @@ -92,7 +78,6 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic deviceIDToGiMap := make(map[string]uint32) deviceIDToCiMap := make(map[string]uint32) - eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError) for _, d := range devices { uuid, gi, ci, err := r.getDevicePlacement(d) if err != nil { @@ -147,13 +132,15 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic continue } + // Only process XID critical error events if e.EventType != nvml.EventTypeXidCriticalError { - klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", e) + klog.Infof("Skipping non-XidCriticalError event: %+v", e) continue } - if skippedXids[e.EventData] { - klog.Infof("Skipping event %+v", e) + // Check if this XID should be treated as critical based on configuration + if !healthConfig.IsCritical(e.EventData) { + klog.Infof("Skipping non-critical XID %d based on health configuration", e.EventData) continue } @@ -188,29 +175,99 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic } } -// getAdditionalXids returns a list of additional Xids to skip from the specified string. -// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. Invalid values -// are ignored. -func getAdditionalXids(input string) []uint64 { +// getHealthConfig returns the health configuration, applying backward compatibility +// with the DP_DISABLE_HEALTHCHECKS environment variable if needed. +func (r *nvmlResourceManager) getHealthConfig() *spec.Health { + envDisableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks)) + + // If health config is explicitly set, use it + if r.config.Health != nil { + if envDisableHealthChecks != "" { + // Clone the config to avoid modifying the original + healthConfig := *r.config.Health + + if envDisableHealthChecks == "all" || strings.Contains(envDisableHealthChecks, "xids") { + healthConfig.Disabled = true + klog.Info("Health checks disabled via DP_DISABLE_HEALTHCHECKS environment variable (backward compatibility)") + } else { + // Parse additional XIDs to ignore from environment variable + additionalIgnored := parseXidsFromEnv(envDisableHealthChecks) + if len(additionalIgnored) > 0 { + healthConfig.IgnoredXIDs = append(healthConfig.IgnoredXIDs, additionalIgnored...) + klog.Infof("Adding XIDs %v to ignored list from DP_DISABLE_HEALTHCHECKS environment variable", additionalIgnored) + } + } + return &healthConfig + } + return r.config.Health + } + + // Fallback to default configuration with environment variable support + healthConfig := spec.DefaultHealth() + + // Apply environment variable for backward compatibility + if envDisableHealthChecks == "all" || strings.Contains(envDisableHealthChecks, "xids") { + healthConfig.Disabled = true + klog.Info("Health checks disabled via DP_DISABLE_HEALTHCHECKS environment variable") + } else if envDisableHealthChecks != "" { + additionalIgnored := parseXidsFromEnv(envDisableHealthChecks) + if len(additionalIgnored) > 0 { + healthConfig.IgnoredXIDs = append(healthConfig.IgnoredXIDs, additionalIgnored...) + klog.Infof("Adding XIDs %v to ignored list from DP_DISABLE_HEALTHCHECKS environment variable", additionalIgnored) + } + } + + return healthConfig +} + +// parseXidsFromEnv parses a comma-separated list of XIDs from an environment variable value +func parseXidsFromEnv(input string) []uint64 { if input == "" { return nil } - var additionalXids []uint64 - for _, additionalXid := range strings.Split(input, ",") { - trimmed := strings.TrimSpace(additionalXid) + var xids []uint64 + for _, xidStr := range strings.Split(input, ",") { + trimmed := strings.TrimSpace(xidStr) if trimmed == "" { continue } xid, err := strconv.ParseUint(trimmed, 10, 64) if err != nil { - klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err) + klog.Infof("Ignoring malformed XID value %v: %v", trimmed, err) continue } - additionalXids = append(additionalXids, xid) + xids = append(xids, xid) + } + return xids +} + +// buildEventMask converts event type strings from configuration to NVML event mask +func buildEventMask(eventTypes []string) (uint64, error) { + if len(eventTypes) == 0 { + // Default to all event types if none specified + return uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError), nil + } + + var eventMask uint64 + for _, eventType := range eventTypes { + switch eventType { + case "EventTypeXidCriticalError": + eventMask |= uint64(nvml.EventTypeXidCriticalError) + case "EventTypeDoubleBitEccError": + eventMask |= uint64(nvml.EventTypeDoubleBitEccError) + case "EventTypeSingleBitEccError": + eventMask |= uint64(nvml.EventTypeSingleBitEccError) + default: + return 0, fmt.Errorf("unknown event type: %s", eventType) + } + } + + if eventMask == 0 { + return 0, fmt.Errorf("no valid event types specified") } - return additionalXids + return eventMask, nil } // getDevicePlacement returns the placement of the specified device. diff --git a/internal/rm/health_config_test.go b/internal/rm/health_config_test.go new file mode 100644 index 000000000..408b896a2 --- /dev/null +++ b/internal/rm/health_config_test.go @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package rm + +import ( + "os" + "testing" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/stretchr/testify/require" + + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" +) + +func TestGetHealthConfig(t *testing.T) { + testCases := []struct { + name string + config *spec.Config + envVar string + expectedResult func(*spec.Health) bool + }{ + { + name: "config with health settings", + config: &spec.Config{ + Health: &spec.Health{ + Disabled: false, + EventTypes: []string{"EventTypeXidCriticalError"}, + IgnoredXIDs: []uint64{13, 31}, + CriticalXIDs: &spec.CriticalXIDsType{ + All: true, + }, + }, + }, + envVar: "", + expectedResult: func(h *spec.Health) bool { + return !h.Disabled && len(h.IgnoredXIDs) == 2 + }, + }, + { + name: "config with env var override to disable", + config: &spec.Config{ + Health: &spec.Health{ + Disabled: false, + }, + }, + envVar: "all", + expectedResult: func(h *spec.Health) bool { + return h.Disabled + }, + }, + { + name: "config with env var adding ignored XIDs", + config: &spec.Config{ + Health: &spec.Health{ + Disabled: false, + IgnoredXIDs: []uint64{13}, + }, + }, + envVar: "48,79", + expectedResult: func(h *spec.Health) bool { + return !h.Disabled && len(h.IgnoredXIDs) == 3 + }, + }, + { + name: "no config with default health", + config: &spec.Config{}, + envVar: "", + expectedResult: func(h *spec.Health) bool { + // Should use default health config + return !h.Disabled && len(h.EventTypes) == 3 && len(h.IgnoredXIDs) == 6 + }, + }, + { + name: "no config with env var disable", + config: &spec.Config{}, + envVar: "xids", + expectedResult: func(h *spec.Health) bool { + return h.Disabled + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Set environment variable if specified + if tc.envVar != "" { + os.Setenv(envDisableHealthChecks, tc.envVar) + defer os.Unsetenv(envDisableHealthChecks) + } + + r := &nvmlResourceManager{ + resourceManager: resourceManager{ + config: tc.config, + }, + } + + health := r.getHealthConfig() + require.NotNil(t, health) + require.True(t, tc.expectedResult(health), "Health config did not match expected result") + }) + } +} + +func TestParseXidsFromEnv(t *testing.T) { + testCases := []struct { + name string + input string + expected []uint64 + }{ + { + name: "empty string", + input: "", + expected: nil, + }, + { + name: "single XID", + input: "48", + expected: []uint64{48}, + }, + { + name: "multiple XIDs", + input: "48,79,94", + expected: []uint64{48, 79, 94}, + }, + { + name: "XIDs with spaces", + input: " 48 , 79 , 94 ", + expected: []uint64{48, 79, 94}, + }, + { + name: "invalid XIDs ignored", + input: "48,invalid,79", + expected: []uint64{48, 79}, + }, + { + name: "mixed valid and invalid", + input: "48,,79,abc,94", + expected: []uint64{48, 79, 94}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result := parseXidsFromEnv(tc.input) + require.Equal(t, tc.expected, result) + }) + } +} + +func TestBuildEventMask(t *testing.T) { + testCases := []struct { + name string + eventTypes []string + expected uint64 + expectError bool + }{ + { + name: "empty event types returns default", + eventTypes: []string{}, + expected: uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError), + expectError: false, + }, + { + name: "single event type", + eventTypes: []string{"EventTypeXidCriticalError"}, + expected: uint64(nvml.EventTypeXidCriticalError), + expectError: false, + }, + { + name: "multiple event types", + eventTypes: []string{"EventTypeXidCriticalError", "EventTypeDoubleBitEccError"}, + expected: uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError), + expectError: false, + }, + { + name: "all event types", + eventTypes: []string{ + "EventTypeXidCriticalError", + "EventTypeDoubleBitEccError", + "EventTypeSingleBitEccError", + }, + expected: uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError), + expectError: false, + }, + { + name: "invalid event type", + eventTypes: []string{"InvalidEventType"}, + expected: 0, + expectError: true, + }, + { + name: "mix of valid and invalid", + eventTypes: []string{"EventTypeXidCriticalError", "InvalidEventType"}, + expected: 0, + expectError: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result, err := buildEventMask(tc.eventTypes) + if tc.expectError { + require.Error(t, err) + } else { + require.NoError(t, err) + require.Equal(t, tc.expected, result) + } + }) + } +} + +func TestHealthConfigIntegration(t *testing.T) { + // Test that health config properly integrates with XID checking + testCases := []struct { + name string + health *spec.Health + xid uint64 + isCritical bool + }{ + { + name: "default config with ignored XID", + health: &spec.Health{ + IgnoredXIDs: []uint64{13, 31, 43}, + CriticalXIDs: &spec.CriticalXIDsType{ + All: true, + }, + }, + xid: 31, + isCritical: false, + }, + { + name: "default config with critical XID", + health: &spec.Health{ + IgnoredXIDs: []uint64{13, 31, 43}, + CriticalXIDs: &spec.CriticalXIDsType{ + All: true, + }, + }, + xid: 48, + isCritical: true, + }, + { + name: "GKE-style config with specific critical", + health: &spec.Health{ + IgnoredXIDs: []uint64{}, + CriticalXIDs: &spec.CriticalXIDsType{ + All: false, + XIDs: []uint64{48}, + }, + }, + xid: 48, + isCritical: true, + }, + { + name: "GKE-style config with non-critical", + health: &spec.Health{ + IgnoredXIDs: []uint64{}, + CriticalXIDs: &spec.CriticalXIDsType{ + All: false, + XIDs: []uint64{48}, + }, + }, + xid: 79, + isCritical: false, + }, + { + name: "disabled health checks", + health: &spec.Health{ + Disabled: true, + CriticalXIDs: &spec.CriticalXIDsType{ + All: true, + }, + }, + xid: 48, + isCritical: false, // Nothing is critical when disabled + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result := tc.health.IsCritical(tc.xid) + require.Equal(t, tc.isCritical, result) + }) + } +} + +func TestBackwardCompatibility(t *testing.T) { + // Test that the new system maintains backward compatibility with DP_DISABLE_HEALTHCHECKS + + t.Run("env var 'all' disables health checks", func(t *testing.T) { + os.Setenv(envDisableHealthChecks, "all") + defer os.Unsetenv(envDisableHealthChecks) + + r := &nvmlResourceManager{ + resourceManager: resourceManager{ + config: &spec.Config{ + Health: &spec.Health{ + Disabled: false, + }, + }, + }, + } + + health := r.getHealthConfig() + require.True(t, health.Disabled) + }) + + t.Run("env var 'xids' disables health checks", func(t *testing.T) { + os.Setenv(envDisableHealthChecks, "xids") + defer os.Unsetenv(envDisableHealthChecks) + + r := &nvmlResourceManager{ + resourceManager: resourceManager{ + config: &spec.Config{}, + }, + } + + health := r.getHealthConfig() + require.True(t, health.Disabled) + }) + + t.Run("env var with XIDs adds to ignored list", func(t *testing.T) { + os.Setenv(envDisableHealthChecks, "48,79") + defer os.Unsetenv(envDisableHealthChecks) + + r := &nvmlResourceManager{ + resourceManager: resourceManager{ + config: &spec.Config{ + Health: &spec.Health{ + IgnoredXIDs: []uint64{13}, + }, + }, + }, + } + + health := r.getHealthConfig() + require.False(t, health.Disabled) + require.Contains(t, health.IgnoredXIDs, uint64(13)) + require.Contains(t, health.IgnoredXIDs, uint64(48)) + require.Contains(t, health.IgnoredXIDs, uint64(79)) + }) +} diff --git a/internal/rm/health_test.go b/internal/rm/health_test.go index 101aadf78..42c6882ad 100644 --- a/internal/rm/health_test.go +++ b/internal/rm/health_test.go @@ -23,7 +23,7 @@ import ( "github.com/stretchr/testify/require" ) -func TestGetAdditionalXids(t *testing.T) { +func TestParseXidsFromEnvLegacy(t *testing.T) { testCases := []struct { input string expected []uint64 @@ -66,7 +66,7 @@ func TestGetAdditionalXids(t *testing.T) { for i, tc := range testCases { t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { - xids := getAdditionalXids(tc.input) + xids := parseXidsFromEnv(tc.input) require.EqualValues(t, tc.expected, xids) })