Skip to content

Commit 7bcab30

Browse files
authored
Merge pull request #1443 from elezar/robertdavidsmith/main
Add support for explicitly enabling XIDs in health checks
2 parents cf4fe80 + 0900fac commit 7bcab30

File tree

2 files changed

+265
-53
lines changed

2 files changed

+265
-53
lines changed

internal/rm/health.go

Lines changed: 103 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,18 @@ const (
3232
// disabled entirely. If set, the envvar is treated as a comma-separated list of Xids to ignore. Note that
3333
// this is in addition to the Application errors that are already ignored.
3434
envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS"
35-
allHealthChecks = "xids"
35+
// envEnableHealthChecks defines the environment variable that is checked to
36+
// determine which XIDs should be explicitly enabled. XIDs specified here
37+
// override the ones specified in the `DP_DISABLE_HEALTHCHECKS`.
38+
// Note that this also allows individual XIDs to be selected when ALL XIDs
39+
// are disabled.
40+
envEnableHealthChecks = "DP_ENABLE_HEALTHCHECKS"
3641
)
3742

3843
// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
3944
func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error {
40-
disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks))
41-
if disableHealthChecks == "all" {
42-
disableHealthChecks = allHealthChecks
43-
}
44-
if strings.Contains(disableHealthChecks, "xids") {
45+
xids := getHealthCheckXids()
46+
if xids.IsAllDisabled() {
4547
return nil
4648
}
4749

@@ -59,26 +61,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
5961
}
6062
}()
6163

62-
// FIXME: formalize the full list and document it.
63-
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
64-
// Application errors: the GPU should still be healthy
65-
ignoredXids := []uint64{
66-
13, // Graphics Engine Exception
67-
31, // GPU memory page fault
68-
43, // GPU stopped processing
69-
45, // Preemptive cleanup, due to previous errors
70-
68, // Video processor exception
71-
109, // Context Switch Timeout Error
72-
}
73-
74-
skippedXids := make(map[uint64]bool)
75-
for _, id := range ignoredXids {
76-
skippedXids[id] = true
77-
}
78-
79-
for _, additionalXid := range getAdditionalXids(disableHealthChecks) {
80-
skippedXids[additionalXid] = true
81-
}
64+
klog.Infof("Using XIDs for health checks: %v", xids)
8265

8366
eventSet, ret := r.nvml.EventSetCreate()
8467
if ret != nvml.SUCCESS {
@@ -152,7 +135,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
152135
continue
153136
}
154137

155-
if skippedXids[e.EventData] {
138+
if xids.IsDisabled(e.EventData) {
156139
klog.Infof("Skipping event %+v", e)
157140
continue
158141
}
@@ -188,29 +171,109 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
188171
}
189172
}
190173

191-
// getAdditionalXids returns a list of additional Xids to skip from the specified string.
192-
// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. Invalid values
193-
// are ignored.
194-
func getAdditionalXids(input string) []uint64 {
195-
if input == "" {
196-
return nil
174+
const allXIDs = 0
175+
176+
// disabledXIDs stores a map of explicitly disabled XIDs.
177+
// The special XID `allXIDs` indicates that all XIDs are disabled, but does
178+
// allow for specific XIDs to be enabled even if this is the case.
179+
type disabledXIDs map[uint64]bool
180+
181+
// Disabled returns whether XID-based health checks are disabled.
182+
// These are considered if all XIDs have been disabled AND no other XIDs have
183+
// been explcitly enabled.
184+
func (h disabledXIDs) IsAllDisabled() bool {
185+
if allDisabled, ok := h[allXIDs]; ok {
186+
return allDisabled
197187
}
188+
// At this point we wither have explicitly disabled XIDs or explicitly
189+
// enabled XIDs. Since ANY XID that's not specified is assumed enabled, we
190+
// return here.
191+
return false
192+
}
198193

199-
var additionalXids []uint64
200-
for _, additionalXid := range strings.Split(input, ",") {
201-
trimmed := strings.TrimSpace(additionalXid)
194+
// IsDisabled checks whether the specified XID has been explicitly disalbled.
195+
// An XID is considered disabled if it has been explicitly disabled, or all XIDs
196+
// have been disabled.
197+
func (h disabledXIDs) IsDisabled(xid uint64) bool {
198+
// Handle the case where enabled=all.
199+
if explicitAll, ok := h[allXIDs]; ok && !explicitAll {
200+
return false
201+
}
202+
// Handle the case where the XID has been specifically enabled (or disabled)
203+
if disabled, ok := h[xid]; ok {
204+
return disabled
205+
}
206+
return h.IsAllDisabled()
207+
}
208+
209+
// getHealthCheckXids returns the XIDs that are considered fatal.
210+
// Here we combine the following (in order of precedence):
211+
// * A list of explicitly disabled XIDs (including all XIDs)
212+
// * A list of hardcoded disabled XIDs
213+
// * A list of explicitly enabled XIDs (including all XIDs)
214+
//
215+
// Note that if an XID is explicitly enabled, this takes precedence over it
216+
// having been disabled either explicitly or implicitly.
217+
func getHealthCheckXids() disabledXIDs {
218+
disabled := newHealthCheckXIDs(
219+
// TODO: We should not read the envvar here directly, but instead
220+
// "upgrade" this to a top-level config option.
221+
strings.Split(strings.ToLower(os.Getenv(envDisableHealthChecks)), ",")...,
222+
)
223+
enabled := newHealthCheckXIDs(
224+
// TODO: We should not read the envvar here directly, but instead
225+
// "upgrade" this to a top-level config option.
226+
strings.Split(strings.ToLower(os.Getenv(envEnableHealthChecks)), ",")...,
227+
)
228+
229+
// Add the list of hardcoded disabled (ignored) XIDs:
230+
// FIXME: formalize the full list and document it.
231+
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
232+
// Application errors: the GPU should still be healthy
233+
ignoredXids := []uint64{
234+
13, // Graphics Engine Exception
235+
31, // GPU memory page fault
236+
43, // GPU stopped processing
237+
45, // Preemptive cleanup, due to previous errors
238+
68, // Video processor exception
239+
109, // Context Switch Timeout Error
240+
}
241+
for _, ignored := range ignoredXids {
242+
disabled[ignored] = true
243+
}
244+
245+
// Explicitly ENABLE specific XIDs,
246+
for enabled := range enabled {
247+
disabled[enabled] = false
248+
}
249+
return disabled
250+
}
251+
252+
// newHealthCheckXIDs converts a list of Xids to a healthCheckXIDs map.
253+
// Special xid values 'all' and 'xids' return a special map that matches all
254+
// xids.
255+
// For other xids, these are converted to a uint64 values with invalid values
256+
// being ignored.
257+
func newHealthCheckXIDs(xids ...string) disabledXIDs {
258+
output := make(disabledXIDs)
259+
for _, xid := range xids {
260+
trimmed := strings.TrimSpace(xid)
261+
if trimmed == "all" || trimmed == "xids" {
262+
// TODO: We should have a different type for "all" and "all-except"
263+
return disabledXIDs{allXIDs: true}
264+
}
202265
if trimmed == "" {
203266
continue
204267
}
205-
xid, err := strconv.ParseUint(trimmed, 10, 64)
268+
id, err := strconv.ParseUint(trimmed, 10, 64)
206269
if err != nil {
207270
klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err)
208271
continue
209272
}
210-
additionalXids = append(additionalXids, xid)
211-
}
212273

213-
return additionalXids
274+
output[id] = true
275+
}
276+
return output
214277
}
215278

216279
// getDevicePlacement returns the placement of the specified device.

0 commit comments

Comments
 (0)