-
Notifications
You must be signed in to change notification settings - Fork 99
Add GPU health check #689
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Add GPU health check #689
Changes from all commits
0e5dd5e
a7c85c7
896ef3a
599fb15
ae7211e
2d7618d
1a950ca
7d501e4
53d57f5
32bd655
f6ba4c8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,342 @@ | ||
| /* | ||
| * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package main | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "strconv" | ||
| "strings" | ||
| "sync" | ||
|
|
||
| "github.com/NVIDIA/go-nvml/pkg/nvml" | ||
| "k8s.io/klog/v2" | ||
| ) | ||
|
|
||
| const ( | ||
| FullGPUInstanceID uint32 = 0xFFFFFFFF | ||
| ) | ||
|
|
||
| // For a MIG device the placement is defined by the 3-tuple <parent UUID, GI, CI>. | ||
| // For a full device the returned 3-tuple is the device's uuid and (FullGPUInstanceID) 0xFFFFFFFF for the other two elements. | ||
| type devicePlacementMap map[string]map[uint32]map[uint32]*AllocatableDevice | ||
|
|
||
| type nvmlDeviceHealthMonitor struct { | ||
| nvmllib nvml.Interface | ||
| eventSet nvml.EventSet | ||
| unhealthy chan *AllocatableDevice | ||
| cancelContext context.CancelFunc | ||
| deviceByPlacement devicePlacementMap | ||
| skippedXids map[uint64]bool | ||
| wg sync.WaitGroup | ||
| } | ||
|
|
||
| func newNvmlDeviceHealthMonitor(config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) { | ||
| if nvdevlib.nvmllib == nil { | ||
| return nil, fmt.Errorf("nvml library is nil") | ||
| } | ||
| if ret := nvdevlib.nvmllib.Init(); ret != nvml.SUCCESS { | ||
| return nil, fmt.Errorf("failed to initialize NVML: %v", ret) | ||
| } | ||
| defer func() { | ||
| _ = nvdevlib.nvmllib.Shutdown() | ||
| }() | ||
|
|
||
| m := &nvmlDeviceHealthMonitor{ | ||
| nvmllib: nvdevlib.nvmllib, | ||
| unhealthy: make(chan *AllocatableDevice, len(allocatable)), | ||
| deviceByPlacement: getDevicePlacementMap(allocatable), | ||
| skippedXids: xidsToSkip(config.flags.additionalXidsToIgnore), | ||
| } | ||
| return m, nil | ||
| } | ||
|
|
||
| func (m *nvmlDeviceHealthMonitor) Start(ctx context.Context) (err error) { | ||
| if ret := m.nvmllib.Init(); ret != nvml.SUCCESS { | ||
| return fmt.Errorf("failed to initialize NVML: %v", ret) | ||
| } | ||
|
|
||
| defer func() { | ||
| if err != nil { | ||
| _ = m.nvmllib.Shutdown() | ||
| } | ||
| }() | ||
|
|
||
| klog.V(6).Info("creating NVML events for device health monitor") | ||
| eventSet, ret := m.nvmllib.EventSetCreate() | ||
| if ret != nvml.SUCCESS { | ||
| return fmt.Errorf("failed to create event set: %w", ret) | ||
| } | ||
|
|
||
| ctx, cancel := context.WithCancel(ctx) | ||
| m.cancelContext = cancel | ||
|
|
||
| m.eventSet = eventSet | ||
|
|
||
| klog.V(6).Info("registering NVML events for device health monitor") | ||
| m.registerEventsForDevices() | ||
|
|
||
| klog.V(6).Info("started device health monitoring") | ||
| m.wg.Add(1) | ||
| go m.run(ctx) | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() { | ||
| eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError) | ||
|
|
||
| for parentUUID, giMap := range m.deviceByPlacement { | ||
| gpu, ret := m.nvmllib.DeviceGetHandleByUUID(parentUUID) | ||
| if ret != nvml.SUCCESS { | ||
| klog.Infof("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", parentUUID, ret) | ||
| m.markAllSlicesUnhealthy(giMap) | ||
| continue | ||
| } | ||
|
|
||
| supportedEvents, ret := gpu.GetSupportedEventTypes() | ||
| if ret != nvml.SUCCESS { | ||
| klog.Infof("unable to determine the supported events for %s: %v; marking it as unhealthy", parentUUID, ret) | ||
| m.markAllSlicesUnhealthy(giMap) | ||
| continue | ||
| } | ||
|
|
||
| ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet) | ||
| if ret == nvml.ERROR_NOT_SUPPORTED { | ||
| klog.Warningf("Device %v is too old to support healthchecking.", parentUUID) | ||
| } | ||
| if ret != nvml.SUCCESS { | ||
| klog.Infof("unable to register events for %s: %v; marking it as unhealthy", parentUUID, ret) | ||
| m.markAllSlicesUnhealthy(giMap) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func (m *nvmlDeviceHealthMonitor) Stop() { | ||
| if m == nil { | ||
| return | ||
| } | ||
| klog.V(6).Info("stopping health monitor") | ||
|
|
||
| if m.cancelContext != nil { | ||
| m.cancelContext() | ||
| } | ||
|
|
||
| m.wg.Wait() | ||
|
|
||
| if ret := m.eventSet.Free(); ret != nvml.SUCCESS { | ||
| klog.Warningf("failed to unset events: %v", ret) | ||
| } | ||
|
|
||
| if ret := m.nvmllib.Shutdown(); ret != nvml.SUCCESS { | ||
| klog.Warningf("failed to shutdown NVML: %v", ret) | ||
| } | ||
| close(m.unhealthy) | ||
| } | ||
|
|
||
| func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) { | ||
| defer m.wg.Done() | ||
| for { | ||
| select { | ||
| case <-ctx.Done(): | ||
| klog.V(6).Info("Stopping event-driven GPU health monitor...") | ||
| return | ||
| default: | ||
| event, ret := m.eventSet.Wait(5000) // timeout in 5000 ms. | ||
| if ret == nvml.ERROR_TIMEOUT { | ||
| continue | ||
| } | ||
| // not all return errors are handled as currently there is no proper way to process these errors other than marking all devices healthy. | ||
| // Ref doc: [https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents_1g9714b0ca9a34c7a7780f87fee16b205c]. | ||
| if ret != nvml.SUCCESS { | ||
| if ret == nvml.ERROR_GPU_IS_LOST { | ||
| klog.Infof("GPU is lost error: %v; Marking all devices as unhealthy", ret) | ||
| m.markAllDevicesUnhealthy() | ||
| continue | ||
| } | ||
| klog.Warningf("Error waiting for NVML event: %v. Retrying...", ret) | ||
| continue | ||
| } | ||
|
|
||
| // To-do: check why other supported types are not considered? | ||
| eType := event.EventType | ||
| eData := event.EventData | ||
| eGI := event.GpuInstanceId | ||
| eCI := event.ComputeInstanceId | ||
| if eType != nvml.EventTypeXidCriticalError { | ||
| klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: Data=%d, Type=%d, GI=%d, CI=%d", eData, eType, eGI, eCI) | ||
| continue | ||
| } | ||
|
|
||
| if m.skippedXids[eData] { | ||
| klog.Infof("Skipping XID event: Data=%d, Type=%d, GI=%d, CI=%d", eData, eType, eGI, eCI) | ||
| continue | ||
| } | ||
|
|
||
| klog.Infof("Processing event XID=%d event", eData) | ||
| // this seems an extreme action. | ||
| // should we just log the error and proceed anyway. | ||
| // To-do: look into how to properly handle this error. | ||
| eventUUID, ret := event.Device.GetUUID() | ||
| if ret != nvml.SUCCESS { | ||
| klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret) | ||
| m.markAllDevicesUnhealthy() | ||
| continue | ||
| } | ||
| affectedDevice := m.deviceByPlacement.get(eventUUID, eGI, eCI) | ||
| if affectedDevice == nil { | ||
| klog.Infof("Ignoring event for unexpected device (UUID:%s, GI:%d, CI:%d)", eventUUID, eGI, eCI) | ||
| continue | ||
| } | ||
|
|
||
| klog.Infof("Sending unhealthy notification for device %s due to event type:%v and event data:%d", affectedDevice.UUID(), eType, eData) | ||
| m.unhealthy <- affectedDevice | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice { | ||
| return m.unhealthy | ||
| } | ||
|
|
||
| func (m *nvmlDeviceHealthMonitor) markAllDevicesUnhealthy() { | ||
| for _, giMap := range m.deviceByPlacement { | ||
| m.markAllSlicesUnhealthy(giMap) | ||
| } | ||
| } | ||
|
|
||
| // markAllSlicesUnhealthy is a helper function to mark every mig device under a parent as unhealthy. | ||
| func (m *nvmlDeviceHealthMonitor) markAllSlicesUnhealthy(giMap map[uint32]map[uint32]*AllocatableDevice) { | ||
| for _, ciMap := range giMap { | ||
| for _, dev := range ciMap { | ||
| // Non-blocking send to avoid deadlocks if channel is full. | ||
| select { | ||
| case m.unhealthy <- dev: | ||
| klog.V(6).Infof("Marked device %s as unhealthy", dev.UUID()) | ||
| default: | ||
| klog.Errorf("Unhealthy channel full. Dropping unhealthy notification for device %s", dev.UUID()) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func getDevicePlacementMap(allocatable AllocatableDevices) devicePlacementMap { | ||
| placementMap := make(devicePlacementMap) | ||
|
|
||
| for _, d := range allocatable { | ||
| var parentUUID string | ||
| var giID, ciID uint32 | ||
|
|
||
| switch d.Type() { | ||
| case GpuDeviceType: | ||
| parentUUID = d.UUID() | ||
| if parentUUID == "" { | ||
| continue | ||
| } | ||
| giID = FullGPUInstanceID | ||
| ciID = FullGPUInstanceID | ||
|
|
||
| case MigDeviceType: | ||
| parentUUID = d.Mig.parent.UUID | ||
| if parentUUID == "" { | ||
| continue | ||
| } | ||
| giID = d.Mig.giInfo.Id | ||
| ciID = d.Mig.ciInfo.Id | ||
|
|
||
| default: | ||
| klog.Errorf("Skipping device with unknown type: %s", d.Type()) | ||
| continue | ||
| } | ||
| placementMap.addDevice(parentUUID, giID, ciID, d) | ||
| } | ||
| return placementMap | ||
| } | ||
|
|
||
| func (p devicePlacementMap) addDevice(parentUUID string, giID uint32, ciID uint32, d *AllocatableDevice) { | ||
| if _, ok := p[parentUUID]; !ok { | ||
| p[parentUUID] = make(map[uint32]map[uint32]*AllocatableDevice) | ||
| } | ||
| if _, ok := p[parentUUID][giID]; !ok { | ||
| p[parentUUID][giID] = make(map[uint32]*AllocatableDevice) | ||
| } | ||
| p[parentUUID][giID][ciID] = d | ||
| } | ||
|
|
||
| func (p devicePlacementMap) get(uuid string, gi, ci uint32) *AllocatableDevice { | ||
| giMap, ok := p[uuid] | ||
| if !ok { | ||
| return nil | ||
| } | ||
|
|
||
| ciMap, ok := giMap[gi] | ||
| if !ok { | ||
| return nil | ||
| } | ||
| return ciMap[ci] | ||
| } | ||
|
|
||
| // getAdditionalXids returns a list of additional Xids to skip from the specified string. | ||
| // The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. | ||
| // Invalid values nare ignored. | ||
| // To-do: add list of EXPLICIT XIDs from [https://github.com/NVIDIA/k8s-device-plugin/pull/1443]. | ||
| func getAdditionalXids(input string) []uint64 { | ||
| if input == "" { | ||
| return nil | ||
| } | ||
|
|
||
| var additionalXids []uint64 | ||
| klog.V(6).Infof("Creating a list of additional xids to ignore: [%s]", input) | ||
| for _, additionalXid := range strings.Split(input, ",") { | ||
| trimmed := strings.TrimSpace(additionalXid) | ||
| if trimmed == "" { | ||
| continue | ||
| } | ||
| xid, err := strconv.ParseUint(trimmed, 10, 64) | ||
| if err != nil { | ||
| klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err) | ||
| continue | ||
| } | ||
| additionalXids = append(additionalXids, xid) | ||
| } | ||
|
|
||
| return additionalXids | ||
| } | ||
|
|
||
| func xidsToSkip(additionalXids string) map[uint64]bool { | ||
| // Add the list of hardcoded disabled (ignored) XIDs: | ||
| // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 | ||
| // Application errors: the GPU should still be healthy. | ||
| ignoredXids := []uint64{ | ||
| 13, // Graphics Engine Exception | ||
| 31, // GPU memory page fault | ||
| 43, // GPU stopped processing | ||
| 45, // Preemptive cleanup, due to previous errors | ||
| 68, // Video processor exception | ||
| 109, // Context Switch Timeout Error | ||
guptaNswati marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
Comment on lines
+321
to
+331
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know that this list is taken from the device plugin, but handling them explicity at such a low-level is quite difficult to customize. Does it make sense to not port this logic over by instead define these as the default value for the envvar where we expose this to the user? (I'm happy to do this as a follow-up though). Just as a note for completeness. The GKE device plugin doesn't use this strategy for XIDs. They only element in their list is XID48 (see https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/0509b1f9f4b9a357b44ba65e7b508ded8bd5ecf0/pkg/gpu/nvidia/health_check/health_checker.go#L59). |
||
|
|
||
| skippedXids := make(map[uint64]bool) | ||
| for _, id := range ignoredXids { | ||
| skippedXids[id] = true | ||
| } | ||
|
|
||
| for _, additionalXid := range getAdditionalXids(additionalXids) { | ||
| skippedXids[additionalXid] = true | ||
| } | ||
| return skippedXids | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems bit aggressive to mark all devices as unhealthy on one invalid event. Should we log this as error and continue watch? cc @klueska
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
its how its done in device-plugin https://github.com/NVIDIA/k8s-device-plugin/blob/main/internal/rm/health.go#L147
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd also say we should log an error and otherwise proceed. Even if what you've shown here is currently done in the device plugin.
By the way, this would have been a perfect opportunity for a better code comment in the legacy code:
No blame, no emotions -- but this code comment does not add information in addition to the code. The interesting bit would be if there is a specific, non-obvious reason / relevance for this style of treatment.
For example, I wonder if this code was introduced to fix a bug. I wonder if it is even ever exercised.
The way it's written and with the git blame history, it seems like it was potentially added initially (defensively) and may never have been exercised in production.