We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
2 parents: 831c31e + f74a958. Commit: 23bc08a (Copy full SHA for 23bc08a)
internal/rm/health.go
@@ -62,16 +62,17 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
62
// FIXME: formalize the full list and document it.
63
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
64
// Application errors: the GPU should still be healthy
65
- applicationErrorXids := []uint64{
66
- 13, // Graphics Engine Exception
67
- 31, // GPU memory page fault
68
- 43, // GPU stopped processing
69
- 45, // Preemptive cleanup, due to previous errors
70
- 68, // Video processor exception
+ ignoredXids := []uint64{
+ 13, // Graphics Engine Exception
+ 31, // GPU memory page fault
+ 43, // GPU stopped processing
+ 45, // Preemptive cleanup, due to previous errors
+ 68, // Video processor exception
71
+ 109, // Context Switch Timeout Error
72
}
73
74
skippedXids := make(map[uint64]bool)
- for _, id := range applicationErrorXids {
75
+ for _, id := range ignoredXids {
76
skippedXids[id] = true
77
78
0 commit comments