Skip to content
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions cmd/gpu-kubelet-plugin/allocatable.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,19 @@ import (
resourceapi "k8s.io/api/resource/v1"
)

const (
// Healthy is the Health value of a device that is considered healthy.
// IsHealthy reports true only when a device's Health equals this value.
Healthy = "Healthy"
// Unhealthy is the Health value of a device that is considered unhealthy.
Unhealthy = "Unhealthy"
)

type AllocatableDevices map[string]*AllocatableDevice

// AllocatableDevice describes a single allocatable device. GetUUID reads the
// UUID from Gpu when it is non-nil and otherwise from Mig.
//
// NOTE(review): the Gpu/Mig pair is listed twice in this view, which looks like
// the removed and added sides of a diff shown together — confirm that the real
// struct declares each field only once.
type AllocatableDevice struct {
Gpu *GpuInfo
Mig *MigDeviceInfo
Gpu *GpuInfo
Mig *MigDeviceInfo
// Health is the health status of the device; IsHealthy compares it against
// Healthy. The zero value ("") is therefore reported as not healthy.
Health string // from device-plugin
}

func (d AllocatableDevice) Type() string {
Expand Down Expand Up @@ -69,6 +77,20 @@ func (d *AllocatableDevice) GetDevice() resourceapi.Device {
panic("unexpected type for AllocatableDevice")
}

// IsHealthy reports whether the device's Health field equals Healthy. Any
// other value, including the empty string, is reported as not healthy.
func (d *AllocatableDevice) IsHealthy() bool {
	status := d.Health
	return status == Healthy
}

// GetUUID returns the UUID of the underlying device. The full GPU takes
// precedence over the MIG device when both are set; the empty string is
// returned when neither is set.
func (d *AllocatableDevice) GetUUID() string {
	switch {
	case d.Gpu != nil:
		return d.Gpu.UUID
	case d.Mig != nil:
		return d.Mig.UUID
	default:
		return ""
	}
}

func (d AllocatableDevices) GpuUUIDs() []string {
var uuids []string
for _, device := range d {
Expand Down
280 changes: 280 additions & 0 deletions cmd/gpu-kubelet-plugin/device_health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"context"
"fmt"
"strconv"
"strings"
"sync"

"github.com/NVIDIA/go-nvml/pkg/nvml"
"k8s.io/klog/v2"
)

const (
// InstanceID is the sentinel compared against an event's GpuInstanceId and
// ComputeInstanceId in run: an event is attributed to a MIG device only when
// both IDs differ from this value, otherwise it is attributed to the full GPU.
InstanceID uint32 = 0xFFFFFFFF
)

// deviceHealthMonitor waits on an NVML event set in a background goroutine and
// reports the devices it considers unhealthy on a channel.
type deviceHealthMonitor struct {
// nvmllib is the NVML library used to initialise/shut down NVML, create the
// event set and look up device handles by UUID.
nvmllib nvml.Interface
// eventSet is created in newDeviceHealthMonitor, waited on by run and
// freed by Stop.
eventSet nvml.EventSet
// unhealthy carries devices that were marked unhealthy; it is exposed
// read-only through Unhealthy and closed by Stop.
unhealthy chan *AllocatableDevice
// stop is closed by Stop to ask the run goroutine to exit.
stop chan struct{}
// uuidToDeviceMap indexes the allocatable devices by their UUID (see
// getUUIDToDeviceMap).
uuidToDeviceMap map[string]*AllocatableDevice
// wg lets Stop wait for the run goroutine to finish.
wg sync.WaitGroup
}

// newDeviceHealthMonitor builds a health monitor for the given allocatable
// devices and starts its event loop in a background goroutine.
//
// It initialises NVML, creates an event set, indexes the devices by UUID,
// registers them for events and finally launches run with the set of Xids to
// skip (built from config.flags.additionalXidsToIgnore). The unhealthy channel
// is buffered with one slot per allocatable device.
//
// An error is returned when nvdevlib has no NVML library, when NVML cannot be
// initialised, or when the event set cannot be created (NVML is shut down
// again in that last case). ctx is currently not used by this function.
func newDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*deviceHealthMonitor, error) {
	lib := nvdevlib.nvmllib
	if lib == nil {
		return nil, fmt.Errorf("nvml library is nil")
	}

	monitor := &deviceHealthMonitor{
		nvmllib:   lib,
		unhealthy: make(chan *AllocatableDevice, len(allocatable)),
		stop:      make(chan struct{}),
	}

	initRet := monitor.nvmllib.Init()
	if initRet != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to initialize NVML: %v", initRet)
	}

	klog.V(6).Info("creating NVML events for device health monitor")
	events, createRet := monitor.nvmllib.EventSetCreate()
	if createRet != nvml.SUCCESS {
		// NVML was initialised above, so undo that before bailing out.
		_ = monitor.nvmllib.Shutdown()
		return nil, fmt.Errorf("failed to create event set: %w", createRet)
	}
	monitor.eventSet = events
	monitor.uuidToDeviceMap = getUUIDToDeviceMap(allocatable)

	klog.V(6).Info("registering NVML events for device health monitor")
	monitor.registerDevicesForEvents()

	xids := monitor.xidsToSkip(config.flags.additionalXidsToIgnore)
	klog.V(6).Info("started device health monitoring")
	monitor.wg.Add(1)
	go monitor.run(xids)

	return monitor, nil
}

func (m *deviceHealthMonitor) registerDevicesForEvents() {
eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)

processedUUIDs := make(map[string]bool)

for uuid, dev := range m.uuidToDeviceMap {
var u string
if dev.Type() == MigDeviceType {
u = dev.Mig.parent.UUID
} else {
u = uuid
}

if processedUUIDs[u] {
continue
}
gpu, ret := m.nvmllib.DeviceGetHandleByUUID(u)
if ret != nvml.SUCCESS {
klog.Infof("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", u, ret)
m.unhealthy <- dev
continue
}

supportedEvents, ret := gpu.GetSupportedEventTypes()
if ret != nvml.SUCCESS {
klog.Infof("unable to determine the supported events for %s: %v; marking it as unhealthy", u, ret)
m.unhealthy <- dev
continue
}

ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
if ret == nvml.ERROR_NOT_SUPPORTED {
klog.Warningf("Device %v is too old to support healthchecking.", u)
}
if ret != nvml.SUCCESS {
klog.Infof("unable to register events for %s: %v; marking it as unhealthy", u, ret)
m.unhealthy <- dev
}
Comment on lines +109 to +116
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we make it look like the other two checks above? feels odd otherwise. Also the continue is missing (which is present in the other 2 checks above.

Suggested change
ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
if ret == nvml.ERROR_NOT_SUPPORTED {
klog.Warningf("Device %v is too old to support healthchecking.", u)
}
if ret != nvml.SUCCESS {
klog.Infof("unable to register events for %s: %v; marking it as unhealthy", u, ret)
m.unhealthy <- dev
}
ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
if ret != nvml.SUCCESS {
if ret == nvml.ERROR_NOT_SUPPORTED {
klog.Warningf("Device %v is too old to support healthchecking.", u)
}
klog.Infof("unable to register events for %s: %v; marking it as unhealthy", u, ret)
m.unhealthy <- dev
continue
}

processedUUIDs[u] = true
}
}

// Stop shuts the monitor down. It signals the run goroutine by closing the
// stop channel, waits for it to exit, frees the NVML event set, shuts NVML
// down and finally closes the unhealthy channel so that receivers observe the
// end of the stream.
//
// Failures to free the event set or to shut NVML down are logged as warnings
// and do not abort the remaining teardown steps. Calling Stop on a nil monitor
// is a no-op. Stop must be called at most once: it closes channels.
func (m *deviceHealthMonitor) Stop() {
	if m == nil {
		return
	}
	klog.V(6).Info("stopping health monitor")

	close(m.stop)
	m.wg.Wait()

	// The run goroutine has exited, so nothing waits on the event set anymore.
	if ret := m.eventSet.Free(); ret != nvml.SUCCESS {
		klog.Warningf("failed to free NVML event set: %v", ret)
	}

	if ret := m.nvmllib.Shutdown(); ret != nvml.SUCCESS {
		klog.Warningf("failed to shutdown NVML: %v", ret)
	}
	// Safe to close: the only sender after construction (run) has returned.
	close(m.unhealthy)
}

// getUUIDToDeviceMap indexes the allocatable devices by the UUID reported by
// GetUUID. Devices that report an empty UUID are left out. The returned map is
// never nil.
func getUUIDToDeviceMap(allocatable AllocatableDevices) map[string]*AllocatableDevice {
	byUUID := map[string]*AllocatableDevice{}

	for _, dev := range allocatable {
		uuid := dev.GetUUID()
		if uuid == "" {
			continue
		}
		byUUID[uuid] = dev
	}
	return byUUID
}

func (m *deviceHealthMonitor) run(skippedXids map[uint64]bool) {
defer m.wg.Done()
for {
select {
case <-m.stop:
klog.V(6).Info("Stopping event-driven GPU health monitor...")
return
default:
event, ret := m.eventSet.Wait(5000)
if ret == nvml.ERROR_TIMEOUT {
continue
}
if ret != nvml.SUCCESS {
klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", ret)
for _, dev := range m.uuidToDeviceMap {
m.unhealthy <- dev
}
continue
}

if event.EventType != nvml.EventTypeXidCriticalError {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we are registering for nvml.EventTypeDoubleBitEccError and nvml.EventTypeSingleBitEccError also above... see comment above

klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", event)
continue
}

if skippedXids[event.EventData] {
klog.Infof("Skipping event %+v", event)
continue
}

klog.Infof("Processing event %+v", event)
eventUUID, ret := event.Device.GetUUID()
if ret != nvml.SUCCESS {
klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret)
for _, dev := range m.uuidToDeviceMap {
m.unhealthy <- dev
}
continue
}

var affectedDevice *AllocatableDevice
if event.GpuInstanceId != InstanceID && event.ComputeInstanceId != InstanceID {
affectedDevice = m.findMigDevice(eventUUID, event.GpuInstanceId, event.ComputeInstanceId)
klog.Infof("Event for mig device: %v", affectedDevice)
} else {
affectedDevice = m.findGpuDevice(eventUUID)
}

if affectedDevice == nil {
klog.Infof("Ignoring event for unexpected device (UUID: %s, GI: %d, CI: %d)", eventUUID, event.GpuInstanceId, event.ComputeInstanceId)
continue
}

klog.Infof("Sending unhealthy notification for device %s due to event type %v", eventUUID, event.EventType)
m.unhealthy <- affectedDevice
}
}
}

// Unhealthy returns the receive-only view of the channel on which the monitor
// publishes devices it considers unhealthy.
func (m *deviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice {
	var notifications <-chan *AllocatableDevice = m.unhealthy
	return notifications
}

// findMigDevice returns the known MIG device whose parent GPU UUID, GPU
// instance ID and compute instance ID all match the arguments, or nil when no
// such device is known.
func (m *deviceHealthMonitor) findMigDevice(parentUUID string, giID uint32, ciID uint32) *AllocatableDevice {
	for _, candidate := range m.uuidToDeviceMap {
		if candidate.Type() != MigDeviceType {
			continue
		}

		mig := candidate.Mig
		if mig.parent.UUID != parentUUID {
			continue
		}
		if mig.giInfo.Id != giID || mig.ciInfo.Id != ciID {
			continue
		}
		return candidate
	}
	return nil
}

// findGpuDevice returns the known device registered under uuid when it is a
// full GPU, and nil when the UUID is unknown or belongs to another device type.
func (m *deviceHealthMonitor) findGpuDevice(uuid string) *AllocatableDevice {
	dev, found := m.uuidToDeviceMap[uuid]
	if !found || dev.Type() != GpuDeviceType {
		return nil
	}
	return dev
}

// getAdditionalXids returns the list of additional Xids to skip, parsed from
// the specified string.
// The input is treated as a comma-separated list; every entry that parses as a
// base-10 uint64 (after trimming surrounding whitespace) is returned as an Xid.
// Empty entries are skipped silently and invalid entries are logged and ignored.
// The result is nil when the input is empty or contains no valid entry.
func getAdditionalXids(input string) []uint64 {
	if len(input) == 0 {
		return nil
	}

	klog.V(6).Infof("Creating a list of additional xids to ignore: [%s]", input)

	var xids []uint64
	for _, field := range strings.Split(input, ",") {
		token := strings.TrimSpace(field)
		if token == "" {
			continue
		}

		value, err := strconv.ParseUint(token, 10, 64)
		if err == nil {
			xids = append(xids, value)
			continue
		}
		klog.Infof("Ignoring malformed Xid value %v: %v", token, err)
	}

	return xids
}

// xidsToSkip returns the set of Xids that must not cause a device to be marked
// unhealthy: a built-in list plus every valid Xid found in the comma-separated
// additionalXids string (parsed by getAdditionalXids).
func (m *deviceHealthMonitor) xidsToSkip(additionalXids string) map[uint64]bool {
	skipped := map[uint64]bool{
		13:  true, // Graphics Engine Exception
		31:  true, // GPU memory page fault
		43:  true, // GPU stopped processing
		45:  true, // Preemptive cleanup, due to previous errors
		68:  true, // Video processor exception
		109: true, // Context Switch Timeout Error
	}

	for _, xid := range getAdditionalXids(additionalXids) {
		skipped[xid] = true
	}
	return skipped
}
32 changes: 31 additions & 1 deletion cmd/gpu-kubelet-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,14 +288,22 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
// Look through the configs and figure out which one will be applied to
// each device allocation result based on their order of precedence and type.
configResultsMap := make(map[runtime.Object][]*resourceapi.DeviceRequestAllocationResult)
for _, result := range claim.Status.Allocation.Devices.Results {
results := claim.Status.Allocation.Devices.Results
for _, result := range results {
if result.Driver != DriverName {
continue
}
device, exists := s.allocatable[result.Device]
if !exists {
return nil, fmt.Errorf("requested device is not allocatable: %v", result.Device)
}

// SWATI: Confirm if we want to take an action or not
// Only proceed with config mapping for healthy devices
//if device.Health == Unhealthy {
continue
//}

for _, c := range slices.Backward(configs) {
if slices.Contains(c.Requests, result.Request) {
if _, ok := c.Config.(*configapi.GpuConfig); ok && device.Type() != GpuDeviceType {
Expand All @@ -320,6 +328,19 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
}
}

// SWATI: refactor the device status apply
//if featuregates.Enabled(featuregates.DeviceHealthCheck) {
// if err := s.UpdateDeviceConditionInClaim(ctx, claim.Namespace, claim.Name, results); err != nil {
// klog.Warningf("Failed to update status for ResourceClaim %s/%s: %v", claim.Namespace, claim.Name, err)
// }
//}

// SWATI: This is if above action is implemented
// If no healthy devices are available for configuration, return
//if len(configResultsMap) == 0 {
return nil, fmt.Errorf("no healthy devices available for allocation")
//}

// Normalize, validate, and apply all configs associated with devices that
// need to be prepared. Track device group configs generated from applying the
// config to the set of device allocation results.
Expand Down Expand Up @@ -550,6 +571,15 @@ func GetOpaqueDeviceConfigs(
return resultConfigs, nil
}

// MarkDeviceUnhealthy sets the Health of device to Unhealthy while holding the
// DeviceState lock and logs the UUID of the device that was marked.
func (s *DeviceState) MarkDeviceUnhealthy(device *AllocatableDevice) {
	// TODO(SWATI): check if a mig device is marked properly
	s.Lock()
	defer s.Unlock()

	device.Health = Unhealthy

	uuid := device.GetUUID()
	klog.Infof("Marked device:%s unhealthy", uuid)
}
Comment on lines +584 to +591
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this take health as a parameter so it can be reused once we have the ability to bring a device back to healthy?


// TODO: Dynamic MIG is not yet supported with structured parameters.
// Refactor this to allow for the allocation of statically partitioned MIG
// devices.
Expand Down
Loading