Skip to content
Open
15 changes: 15 additions & 0 deletions cmd/gpu-kubelet-plugin/allocatable.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,22 @@ import (
resourceapi "k8s.io/api/resource/v1"
)

type HealthStatus string

const (
// Healthy means that the device is healthy.
Healthy HealthStatus = "Healthy"
// Unhealthy means that the device is unhealthy.
Unhealthy HealthStatus = "Unhealthy"
)

type AllocatableDevices map[string]*AllocatableDevice

type AllocatableDevice struct {
Gpu *GpuInfo
Mig *MigDeviceInfo
// Defined similarly as https://pkg.go.dev/k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1#Healthy
Health HealthStatus
}

func (d AllocatableDevice) Type() string {
Expand Down Expand Up @@ -96,3 +107,7 @@ func (d AllocatableDevices) UUIDs() []string {
slices.Sort(uuids)
return uuids
}

func (d *AllocatableDevice) IsHealthy() bool {
return d.Health == Healthy
}
342 changes: 342 additions & 0 deletions cmd/gpu-kubelet-plugin/device_health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,342 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"context"
"fmt"
"strconv"
"strings"
"sync"

"github.com/NVIDIA/go-nvml/pkg/nvml"
"k8s.io/klog/v2"
)

const (
FullGPUInstanceID uint32 = 0xFFFFFFFF
)

// For a MIG device the placement is defined by the 3-tuple <parent UUID, GI, CI>.
// For a full device the returned 3-tuple is the device's uuid and (FullGPUInstanceID) 0xFFFFFFFF for the other two elements.
type devicePlacementMap map[string]map[uint32]map[uint32]*AllocatableDevice

type nvmlDeviceHealthMonitor struct {
nvmllib nvml.Interface
eventSet nvml.EventSet
unhealthy chan *AllocatableDevice
cancelContext context.CancelFunc
deviceByPlacement devicePlacementMap
skippedXids map[uint64]bool
wg sync.WaitGroup
}

func newNvmlDeviceHealthMonitor(config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
if nvdevlib.nvmllib == nil {
return nil, fmt.Errorf("nvml library is nil")
}
if ret := nvdevlib.nvmllib.Init(); ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to initialize NVML: %v", ret)
}
defer func() {
_ = nvdevlib.nvmllib.Shutdown()
}()

m := &nvmlDeviceHealthMonitor{
nvmllib: nvdevlib.nvmllib,
unhealthy: make(chan *AllocatableDevice, len(allocatable)),
deviceByPlacement: getDevicePlacementMap(allocatable),
skippedXids: xidsToSkip(config.flags.additionalXidsToIgnore),
}
return m, nil
}

func (m *nvmlDeviceHealthMonitor) Start(ctx context.Context) (err error) {
if ret := m.nvmllib.Init(); ret != nvml.SUCCESS {
return fmt.Errorf("failed to initialize NVML: %v", ret)
}

defer func() {
if err != nil {
_ = m.nvmllib.Shutdown()
}
}()

klog.V(6).Info("creating NVML events for device health monitor")
eventSet, ret := m.nvmllib.EventSetCreate()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to create event set: %w", ret)
}

ctx, cancel := context.WithCancel(ctx)
m.cancelContext = cancel

m.eventSet = eventSet

klog.V(6).Info("registering NVML events for device health monitor")
m.registerEventsForDevices()

klog.V(6).Info("started device health monitoring")
m.wg.Add(1)
go m.run(ctx)

return nil
}

func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() {
eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)

for parentUUID, giMap := range m.deviceByPlacement {
gpu, ret := m.nvmllib.DeviceGetHandleByUUID(parentUUID)
if ret != nvml.SUCCESS {
klog.Infof("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", parentUUID, ret)
m.markAllSlicesUnhealthy(giMap)
continue
}

supportedEvents, ret := gpu.GetSupportedEventTypes()
if ret != nvml.SUCCESS {
klog.Infof("unable to determine the supported events for %s: %v; marking it as unhealthy", parentUUID, ret)
m.markAllSlicesUnhealthy(giMap)
continue
}

ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
if ret == nvml.ERROR_NOT_SUPPORTED {
klog.Warningf("Device %v is too old to support healthchecking.", parentUUID)
}
if ret != nvml.SUCCESS {
klog.Infof("unable to register events for %s: %v; marking it as unhealthy", parentUUID, ret)
m.markAllSlicesUnhealthy(giMap)
}
}
}

func (m *nvmlDeviceHealthMonitor) Stop() {
if m == nil {
return
}
klog.V(6).Info("stopping health monitor")

if m.cancelContext != nil {
m.cancelContext()
}

m.wg.Wait()

if ret := m.eventSet.Free(); ret != nvml.SUCCESS {
klog.Warningf("failed to unset events: %v", ret)
}

if ret := m.nvmllib.Shutdown(); ret != nvml.SUCCESS {
klog.Warningf("failed to shutdown NVML: %v", ret)
}
close(m.unhealthy)
}

func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
defer m.wg.Done()
for {
select {
case <-ctx.Done():
klog.V(6).Info("Stopping event-driven GPU health monitor...")
return
default:
event, ret := m.eventSet.Wait(5000) // timeout in 5000 ms.
if ret == nvml.ERROR_TIMEOUT {
continue
}
// not all return errors are handled as currently there is no proper way to process these errors other than marking all devices healthy.
// Ref doc: [https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents_1g9714b0ca9a34c7a7780f87fee16b205c].
if ret != nvml.SUCCESS {
if ret == nvml.ERROR_GPU_IS_LOST {
klog.Infof("GPU is lost error: %v; Marking all devices as unhealthy", ret)
m.markAllDevicesUnhealthy()
continue
}
klog.Warningf("Error waiting for NVML event: %v. Retrying...", ret)
continue
}

// To-do: check why other supported types are not considered?
eType := event.EventType
eData := event.EventData
eGI := event.GpuInstanceId
eCI := event.ComputeInstanceId
if eType != nvml.EventTypeXidCriticalError {
klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: Data=%d, Type=%d, GI=%d, CI=%d", eData, eType, eGI, eCI)
continue
}

if m.skippedXids[eData] {
klog.Infof("Skipping XID event: Data=%d, Type=%d, GI=%d, CI=%d", eData, eType, eGI, eCI)
continue
}

klog.Infof("Processing event XID=%d event", eData)
// this seems an extreme action.
// should we just log the error and proceed anyway.
// To-do: look into how to properly handle this error.
eventUUID, ret := event.Device.GetUUID()
if ret != nvml.SUCCESS {
klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems bit aggressive to mark all devices as unhealthy on one invalid event. Should we log this as error and continue watch? cc @klueska

Copy link
Contributor Author

@guptaNswati guptaNswati Oct 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator

@jgehrcke jgehrcke Oct 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd also say we should log an error and otherwise proceed. Even if what you've shown here is currently done in the device plugin.

By the way, this would have been a perfect opportunity for a better code comment in the legacy code:

Image

No blame, no emotions -- but this code comment does not add information in addition to the code. The interesting bit would be if there is a specific, non-obvious reason / relevance for this style of treatment.

For example, I wonder if this code was introduced to fix a bug. I wonder if it is even ever exercised.

The way it's written and with the git blame history, it seems like it was potentially added initially (defensively) and may never have been exercised in production.

m.markAllDevicesUnhealthy()
continue
}
affectedDevice := m.deviceByPlacement.get(eventUUID, eGI, eCI)
if affectedDevice == nil {
klog.Infof("Ignoring event for unexpected device (UUID:%s, GI:%d, CI:%d)", eventUUID, eGI, eCI)
continue
}

klog.Infof("Sending unhealthy notification for device %s due to event type:%v and event data:%d", affectedDevice.UUID(), eType, eData)
m.unhealthy <- affectedDevice
}
}
}

func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice {
return m.unhealthy
}

func (m *nvmlDeviceHealthMonitor) markAllDevicesUnhealthy() {
for _, giMap := range m.deviceByPlacement {
m.markAllSlicesUnhealthy(giMap)
}
}

// markAllSlicesUnhealthy is a helper function to mark every mig device under a parent as unhealthy.
func (m *nvmlDeviceHealthMonitor) markAllSlicesUnhealthy(giMap map[uint32]map[uint32]*AllocatableDevice) {
for _, ciMap := range giMap {
for _, dev := range ciMap {
// Non-blocking send to avoid deadlocks if channel is full.
select {
case m.unhealthy <- dev:
klog.V(6).Infof("Marked device %s as unhealthy", dev.UUID())
default:
klog.Errorf("Unhealthy channel full. Dropping unhealthy notification for device %s", dev.UUID())
}
}
}
}

func getDevicePlacementMap(allocatable AllocatableDevices) devicePlacementMap {
placementMap := make(devicePlacementMap)

for _, d := range allocatable {
var parentUUID string
var giID, ciID uint32

switch d.Type() {
case GpuDeviceType:
parentUUID = d.UUID()
if parentUUID == "" {
continue
}
giID = FullGPUInstanceID
ciID = FullGPUInstanceID

case MigDeviceType:
parentUUID = d.Mig.parent.UUID
if parentUUID == "" {
continue
}
giID = d.Mig.giInfo.Id
ciID = d.Mig.ciInfo.Id

default:
klog.Errorf("Skipping device with unknown type: %s", d.Type())
continue
}
placementMap.addDevice(parentUUID, giID, ciID, d)
}
return placementMap
}

func (p devicePlacementMap) addDevice(parentUUID string, giID uint32, ciID uint32, d *AllocatableDevice) {
if _, ok := p[parentUUID]; !ok {
p[parentUUID] = make(map[uint32]map[uint32]*AllocatableDevice)
}
if _, ok := p[parentUUID][giID]; !ok {
p[parentUUID][giID] = make(map[uint32]*AllocatableDevice)
}
p[parentUUID][giID][ciID] = d
}

func (p devicePlacementMap) get(uuid string, gi, ci uint32) *AllocatableDevice {
giMap, ok := p[uuid]
if !ok {
return nil
}

ciMap, ok := giMap[gi]
if !ok {
return nil
}
return ciMap[ci]
}

// getAdditionalXids returns a list of additional Xids to skip from the specified string.
// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values.
// Invalid values nare ignored.
// To-do: add list of EXPLICIT XIDs from [https://github.com/NVIDIA/k8s-device-plugin/pull/1443].
func getAdditionalXids(input string) []uint64 {
if input == "" {
return nil
}

var additionalXids []uint64
klog.V(6).Infof("Creating a list of additional xids to ignore: [%s]", input)
for _, additionalXid := range strings.Split(input, ",") {
trimmed := strings.TrimSpace(additionalXid)
if trimmed == "" {
continue
}
xid, err := strconv.ParseUint(trimmed, 10, 64)
if err != nil {
klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err)
continue
}
additionalXids = append(additionalXids, xid)
}

return additionalXids
}

func xidsToSkip(additionalXids string) map[uint64]bool {
// Add the list of hardcoded disabled (ignored) XIDs:
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
// Application errors: the GPU should still be healthy.
ignoredXids := []uint64{
13, // Graphics Engine Exception
31, // GPU memory page fault
43, // GPU stopped processing
45, // Preemptive cleanup, due to previous errors
68, // Video processor exception
109, // Context Switch Timeout Error
}
Comment on lines +321 to +331
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know that this list is taken from the device plugin, but handling them explicity at such a low-level is quite difficult to customize. Does it make sense to not port this logic over by instead define these as the default value for the envvar where we expose this to the user?

(I'm happy to do this as a follow-up though).

Just as a note for completeness. The GKE device plugin doesn't use this strategy for XIDs. They only element in their list is XID48 (see https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/0509b1f9f4b9a357b44ba65e7b508ded8bd5ecf0/pkg/gpu/nvidia/health_check/health_checker.go#L59).


skippedXids := make(map[uint64]bool)
for _, id := range ignoredXids {
skippedXids[id] = true
}

for _, additionalXid := range getAdditionalXids(additionalXids) {
skippedXids[additionalXid] = true
}
return skippedXids
}
Loading