-
Notifications
You must be signed in to change notification settings - Fork 99
Add GPU health check #689
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Add GPU health check #689
Changes from 2 commits
0e5dd5e
a7c85c7
896ef3a
599fb15
ae7211e
2d7618d
1a950ca
7d501e4
53d57f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,281 @@ | ||
| /* | ||
| * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. | ||
guptaNswati marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package main | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "strconv" | ||
| "strings" | ||
| "sync" | ||
|
|
||
| "github.com/NVIDIA/go-nvml/pkg/nvml" | ||
| "k8s.io/klog/v2" | ||
| ) | ||
|
|
||
// FullGPUInstanceID is the sentinel instance ID used when classifying NVML
// events: an event whose GPU instance ID or compute instance ID equals this
// value is handled as a full-GPU event, while an event where both IDs differ
// from it is handled as a MIG device event.
const FullGPUInstanceID uint32 = 0xFFFFFFFF
|
|
||
| type deviceHealthMonitor struct { | ||
elezar marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| nvmllib nvml.Interface | ||
| eventSet nvml.EventSet | ||
| unhealthy chan *AllocatableDevice | ||
| stop chan struct{} | ||
| uuidToDeviceMap map[string]*AllocatableDevice | ||
| wg sync.WaitGroup | ||
| } | ||
|
|
||
| func newDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*deviceHealthMonitor, error) { | ||
|
||
| if nvdevlib.nvmllib == nil { | ||
| return nil, fmt.Errorf("nvml library is nil") | ||
| } | ||
|
|
||
| m := &deviceHealthMonitor{ | ||
| nvmllib: nvdevlib.nvmllib, | ||
| unhealthy: make(chan *AllocatableDevice, len(allocatable)), | ||
| stop: make(chan struct{}), | ||
| } | ||
|
|
||
| if ret := m.nvmllib.Init(); ret != nvml.SUCCESS { | ||
| return nil, fmt.Errorf("failed to initialize NVML: %v", ret) | ||
| } | ||
|
|
||
| klog.V(6).Info("creating NVML events for device health monitor") | ||
| eventSet, ret := m.nvmllib.EventSetCreate() | ||
| if ret != nvml.SUCCESS { | ||
| _ = m.nvmllib.Shutdown() | ||
| return nil, fmt.Errorf("failed to create event set: %w", ret) | ||
| } | ||
| m.eventSet = eventSet | ||
|
|
||
| m.uuidToDeviceMap = getUUIDToDeviceMap(allocatable) | ||
elezar marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| klog.V(6).Info("registering NVML events for device health monitor") | ||
| m.registerDevicesForEvents() | ||
|
|
||
| skippedXids := m.xidsToSkip(config.flags.additionalXidsToIgnore) | ||
|
||
| klog.V(6).Info("started device health monitoring") | ||
| m.wg.Add(1) | ||
| go m.run(skippedXids) | ||
guptaNswati marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| return m, nil | ||
| } | ||
|
|
||
| func (m *deviceHealthMonitor) registerDevicesForEvents() { | ||
guptaNswati marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError) | ||
|
|
||
| processedUUIDs := make(map[string]bool) | ||
|
|
||
| for uuid, dev := range m.uuidToDeviceMap { | ||
| var u string | ||
| if dev.Type() == MigDeviceType { | ||
| u = dev.Mig.parent.UUID | ||
| } else { | ||
| u = uuid | ||
| } | ||
|
|
||
| if processedUUIDs[u] { | ||
| continue | ||
| } | ||
| gpu, ret := m.nvmllib.DeviceGetHandleByUUID(u) | ||
| if ret != nvml.SUCCESS { | ||
| klog.Infof("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", u, ret) | ||
| m.unhealthy <- dev | ||
| continue | ||
| } | ||
|
|
||
| supportedEvents, ret := gpu.GetSupportedEventTypes() | ||
| if ret != nvml.SUCCESS { | ||
| klog.Infof("unable to determine the supported events for %s: %v; marking it as unhealthy", u, ret) | ||
| m.unhealthy <- dev | ||
| continue | ||
| } | ||
|
|
||
| ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet) | ||
| if ret == nvml.ERROR_NOT_SUPPORTED { | ||
| klog.Warningf("Device %v is too old to support healthchecking.", u) | ||
| } | ||
| if ret != nvml.SUCCESS { | ||
| klog.Infof("unable to register events for %s: %v; marking it as unhealthy", u, ret) | ||
| m.unhealthy <- dev | ||
| } | ||
| processedUUIDs[u] = true | ||
| } | ||
| } | ||
|
|
||
| func (m *deviceHealthMonitor) Stop() { | ||
| if m == nil { | ||
| return | ||
| } | ||
| klog.V(6).Info("stopping health monitor") | ||
|
|
||
| close(m.stop) | ||
| m.wg.Wait() | ||
|
|
||
| _ = m.eventSet.Free() | ||
|
||
|
|
||
| if ret := m.nvmllib.Shutdown(); ret != nvml.SUCCESS { | ||
| klog.Warningf("failed to shutdown NVML: %v", ret) | ||
| } | ||
| close(m.unhealthy) | ||
| } | ||
|
|
||
| func getUUIDToDeviceMap(allocatable AllocatableDevices) map[string]*AllocatableDevice { | ||
| uuidToDeviceMap := make(map[string]*AllocatableDevice) | ||
|
|
||
| for _, d := range allocatable { | ||
| if u := d.UUID(); u != "" { | ||
| uuidToDeviceMap[u] = d | ||
| } | ||
| } | ||
| return uuidToDeviceMap | ||
| } | ||
|
|
||
| func (m *deviceHealthMonitor) run(skippedXids map[uint64]bool) { | ||
| defer m.wg.Done() | ||
| for { | ||
| select { | ||
| case <-m.stop: | ||
| klog.V(6).Info("Stopping event-driven GPU health monitor...") | ||
| return | ||
| default: | ||
| event, ret := m.eventSet.Wait(5000) | ||
|
||
| if ret == nvml.ERROR_TIMEOUT { | ||
| continue | ||
| } | ||
| if ret != nvml.SUCCESS { | ||
| klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", ret) | ||
|
||
| for _, dev := range m.uuidToDeviceMap { | ||
guptaNswati marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| m.unhealthy <- dev | ||
shivamerla marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| continue | ||
| } | ||
|
|
||
| if event.EventType != nvml.EventTypeXidCriticalError { | ||
guptaNswati marked this conversation as resolved.
Show resolved
Hide resolved
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (also for the device plugin) Can we track the follow-up action of checking why we don't check other supported types? Do whe have any indication of whether we ever see the log message below? |
||
| klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", event) | ||
| continue | ||
| } | ||
|
|
||
| if skippedXids[event.EventData] { | ||
| klog.Infof("Skipping event %+v", event) | ||
| continue | ||
| } | ||
|
|
||
| klog.Infof("Processing event %+v", event) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you have an example for this log message, how it would look like in practice?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| eventUUID, ret := event.Device.GetUUID() | ||
| if ret != nvml.SUCCESS { | ||
| klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems bit aggressive to mark all devices as unhealthy on one invalid event. Should we log this as error and continue watch? cc @klueska
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. its how its done in device-plugin https://github.com/NVIDIA/k8s-device-plugin/blob/main/internal/rm/health.go#L147
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd also say we should log an error and otherwise proceed. Even if what you've shown here is currently done in the device plugin. By the way, this would have been a perfect opportunity for a better code comment in the legacy code:
No blame, no emotions -- but this code comment does not add information in addition to the code. The interesting bit would be if there is a specific, non-obvious reason / relevance for this style of treatment. For example, I wonder if this code was introduced to fix a bug. I wonder if it is even ever exercised. The way it's written and with the git blame history, it seems like it was potentially added initially (defensively) and may never have been exercised in production. |
||
| for _, dev := range m.uuidToDeviceMap { | ||
| m.unhealthy <- dev | ||
| } | ||
| continue | ||
| } | ||
|
|
||
| var affectedDevice *AllocatableDevice | ||
| if event.GpuInstanceId != FullGPUInstanceID && event.ComputeInstanceId != FullGPUInstanceID { | ||
| affectedDevice = m.findMigDevice(eventUUID, event.GpuInstanceId, event.ComputeInstanceId) | ||
| klog.Infof("Event for mig device: %s", affectedDevice.UUID()) | ||
shivamerla marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } else { | ||
| affectedDevice = m.findGpuDevice(eventUUID) | ||
| klog.Infof("Event for device: %s", affectedDevice.UUID()) | ||
shivamerla marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| if affectedDevice == nil { | ||
| klog.Infof("Ignoring event for unexpected device (UUID: %s, GI: %d, CI: %d)", eventUUID, event.GpuInstanceId, event.ComputeInstanceId) | ||
| continue | ||
| } | ||
|
|
||
| klog.Infof("Sending unhealthy notification for device %s due to event type: %v and event data: %d", affectedDevice.UUID(), event.EventType, event.EventData) | ||
| m.unhealthy <- affectedDevice | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func (m *deviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice { | ||
| return m.unhealthy | ||
| } | ||
|
|
||
| func (m *deviceHealthMonitor) findMigDevice(parentUUID string, giID uint32, ciID uint32) *AllocatableDevice { | ||
| for _, device := range m.uuidToDeviceMap { | ||
| if device.Type() != MigDeviceType { | ||
| continue | ||
| } | ||
|
|
||
| if device.Mig.parent.UUID == parentUUID && | ||
| device.Mig.giInfo.Id == giID && | ||
| device.Mig.ciInfo.Id == ciID { | ||
| return device | ||
| } | ||
| } | ||
| return nil | ||
guptaNswati marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| func (m *deviceHealthMonitor) findGpuDevice(uuid string) *AllocatableDevice { | ||
| device, exists := m.uuidToDeviceMap[uuid] | ||
| if exists && device.Type() == GpuDeviceType { | ||
| return device | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| // getAdditionalXids returns a list of additional Xids to skip from the specified string. | ||
| // The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. | ||
| // Invalid values nare ignored. | ||
| func getAdditionalXids(input string) []uint64 { | ||
| if input == "" { | ||
| return nil | ||
| } | ||
|
|
||
| var additionalXids []uint64 | ||
| klog.V(6).Infof("Creating a list of additional xids to ignore: [%s]", input) | ||
| for _, additionalXid := range strings.Split(input, ",") { | ||
| trimmed := strings.TrimSpace(additionalXid) | ||
| if trimmed == "" { | ||
| continue | ||
| } | ||
| xid, err := strconv.ParseUint(trimmed, 10, 64) | ||
| if err != nil { | ||
| klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err) | ||
| continue | ||
| } | ||
| additionalXids = append(additionalXids, xid) | ||
| } | ||
|
|
||
| return additionalXids | ||
| } | ||
|
|
||
| func (m *deviceHealthMonitor) xidsToSkip(additionalXids string) map[uint64]bool { | ||
guptaNswati marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ignoredXids := []uint64{ | ||
| 13, // Graphics Engine Exception | ||
| 31, // GPU memory page fault | ||
| 43, // GPU stopped processing | ||
| 45, // Preemptive cleanup, due to previous errors | ||
| 68, // Video processor exception | ||
| 109, // Context Switch Timeout Error | ||
guptaNswati marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| skippedXids := make(map[uint64]bool) | ||
| for _, id := range ignoredXids { | ||
| skippedXids[id] = true | ||
| } | ||
|
|
||
| for _, additionalXid := range getAdditionalXids(additionalXids) { | ||
| skippedXids[additionalXid] = true | ||
| } | ||
| return skippedXids | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -550,6 +550,14 @@ func GetOpaqueDeviceConfigs( | |
| return resultConfigs, nil | ||
| } | ||
|
|
||
| func (s *DeviceState) UpdateDeviceHealthStatus(device *AllocatableDevice, healthstatus string) { | ||
| s.Lock() | ||
| defer s.Unlock() | ||
|
|
||
| device.Health = healthstatus | ||
| klog.Infof("Update device sattus:%s healthstatus", device.UUID()) | ||
elezar marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| // TODO: Dynamic MIG is not yet supported with structured parameters. | ||
| // Refactor this to allow for the allocation of statically partitioned MIG | ||
| // devices. | ||
|
|
||


Uh oh!
There was an error while loading. Please reload this page.