-
Notifications
You must be signed in to change notification settings - Fork 81
add device data in resourceclaim #128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -17,12 +17,18 @@ | |||||||||||
| package main | ||||||||||||
|
|
||||||||||||
| import ( | ||||||||||||
| "context" | ||||||||||||
| "encoding/json" | ||||||||||||
| "fmt" | ||||||||||||
| "slices" | ||||||||||||
| "sync" | ||||||||||||
|
|
||||||||||||
| resourceapi "k8s.io/api/resource/v1" | ||||||||||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||||||||
| "k8s.io/apimachinery/pkg/runtime" | ||||||||||||
| metav1apply "k8s.io/client-go/applyconfigurations/meta/v1" | ||||||||||||
| resourceapply "k8s.io/client-go/applyconfigurations/resource/v1" | ||||||||||||
| "k8s.io/klog/v2" | ||||||||||||
| drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1" | ||||||||||||
| "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" | ||||||||||||
|
|
||||||||||||
|
|
@@ -61,6 +67,7 @@ type DeviceState struct { | |||||||||||
| cdi *CDIHandler | ||||||||||||
| allocatable AllocatableDevices | ||||||||||||
| checkpointManager checkpointmanager.CheckpointManager | ||||||||||||
| config *Config | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| func NewDeviceState(config *Config) (*DeviceState, error) { | ||||||||||||
|
|
@@ -88,6 +95,7 @@ func NewDeviceState(config *Config) (*DeviceState, error) { | |||||||||||
| cdi: cdi, | ||||||||||||
| allocatable: allocatable, | ||||||||||||
| checkpointManager: checkpointManager, | ||||||||||||
| config: config, | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| checkpoints, err := state.checkpointManager.ListCheckpoints() | ||||||||||||
|
|
@@ -109,7 +117,7 @@ func NewDeviceState(config *Config) (*DeviceState, error) { | |||||||||||
| return state, nil | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| func (s *DeviceState) Prepare(claim *resourceapi.ResourceClaim) ([]*drapbv1.Device, error) { | ||||||||||||
| func (s *DeviceState) Prepare(ctx context.Context, claim *resourceapi.ResourceClaim) ([]*drapbv1.Device, error) { | ||||||||||||
| s.Lock() | ||||||||||||
| defer s.Unlock() | ||||||||||||
|
|
||||||||||||
|
|
@@ -125,7 +133,7 @@ func (s *DeviceState) Prepare(claim *resourceapi.ResourceClaim) ([]*drapbv1.Devi | |||||||||||
| return preparedClaims[claimUID].GetDevices(), nil | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| preparedDevices, err := s.prepareDevices(claim) | ||||||||||||
| preparedDevices, err := s.prepareDevices(ctx, claim) | ||||||||||||
| if err != nil { | ||||||||||||
| return nil, fmt.Errorf("prepare failed: %v", err) | ||||||||||||
| } | ||||||||||||
|
|
@@ -173,7 +181,7 @@ func (s *DeviceState) Unprepare(claimUID string) error { | |||||||||||
| return nil | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (PreparedDevices, error) { | ||||||||||||
| func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.ResourceClaim) (PreparedDevices, error) { | ||||||||||||
| if claim.Status.Allocation == nil { | ||||||||||||
| return nil, fmt.Errorf("claim not yet allocated") | ||||||||||||
| } | ||||||||||||
|
|
@@ -196,13 +204,20 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (Prepared | |||||||||||
| Config: configapi.DefaultGpuConfig(), | ||||||||||||
| }) | ||||||||||||
|
|
||||||||||||
| // build device status | ||||||||||||
| var devicesStatus []*resourceapply.AllocatedDeviceStatusApplyConfiguration | ||||||||||||
|
|
||||||||||||
| // Look through the configs and figure out which one will be applied to | ||||||||||||
| // each device allocation result based on their order of precedence. | ||||||||||||
| configResultsMap := make(map[runtime.Object][]*resourceapi.DeviceRequestAllocationResult) | ||||||||||||
| for _, result := range claim.Status.Allocation.Devices.Results { | ||||||||||||
| if _, exists := s.allocatable[result.Device]; !exists { | ||||||||||||
| return nil, fmt.Errorf("requested GPU is not allocatable: %v", result.Device) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| deviceStatus := s.buildDeviceStatus(&result) | ||||||||||||
| devicesStatus = append(devicesStatus, deviceStatus) | ||||||||||||
|
|
||||||||||||
| for _, c := range slices.Backward(configs) { | ||||||||||||
| if len(c.Requests) == 0 || slices.Contains(c.Requests, result.Request) { | ||||||||||||
| configResultsMap[c.Config] = append(configResultsMap[c.Config], &result) | ||||||||||||
|
|
@@ -211,6 +226,11 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (Prepared | |||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| klog.Infof("Adding device attribute to claim %s/%s", claim.Namespace, claim.Name) | ||||||||||||
| if err := s.applyDeviceStatus(ctx, claim.Namespace, claim.Name, devicesStatus...); err != nil { | ||||||||||||
| klog.Warningf("Failed to update device attributes for claim %s/%s: %v", claim.Namespace, claim.Name, err) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| // Normalize, validate, and apply all configs associated with devices that | ||||||||||||
| // need to be prepared. Track container edits generated from applying the | ||||||||||||
| // config to the set of device allocation results. | ||||||||||||
|
|
@@ -380,3 +400,56 @@ func GetOpaqueDeviceConfigs( | |||||||||||
|
|
||||||||||||
| return resultConfigs, nil | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| func (s *DeviceState) buildDeviceStatus(res *resourceapi.DeviceRequestAllocationResult) *resourceapply.AllocatedDeviceStatusApplyConfiguration { | ||||||||||||
| dn := res.Device | ||||||||||||
| deviceInfo := make(map[string]interface{}) | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: could we make this more strongly typed?
Suggested change
|
||||||||||||
|
|
||||||||||||
| if d, ok := s.allocatable[dn]; ok { | ||||||||||||
| if d.Attributes != nil { | ||||||||||||
| attributes := d.Attributes | ||||||||||||
|
|
||||||||||||
| if uuid, ok := attributes["uuid"]; ok { | ||||||||||||
|
Comment on lines
+409
to
+412
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: indexing into a nil map doesn't panic, so we can skip this check. I at least don't see any issues skipping it if I remove all of the attributes from the devices and allocate them.
Suggested change
|
||||||||||||
| deviceInfo["uuid"] = uuid | ||||||||||||
| } | ||||||||||||
| if model, ok := attributes["model"]; ok { | ||||||||||||
| deviceInfo["model"] = model | ||||||||||||
| } | ||||||||||||
| if driverVersion, ok := attributes["driverVersion"]; ok { | ||||||||||||
| deviceInfo["driverVersion"] = driverVersion | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| jsonBytes, err := json.Marshal(deviceInfo) | ||||||||||||
| if err != nil { | ||||||||||||
| klog.Errorf("Failed to marshal device data: %v", err) | ||||||||||||
| jsonBytes = []byte("{}") | ||||||||||||
| } | ||||||||||||
| data := runtime.RawExtension{ | ||||||||||||
| Raw: jsonBytes, | ||||||||||||
| } | ||||||||||||
| cond := metav1apply.Condition(). | ||||||||||||
| WithType("Ready"). | ||||||||||||
| WithStatus(metav1.ConditionTrue). | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does the NVIDIA driver also only set this condition at the time the claim is allocated, or can it ever change after that? If it can change, I think the example driver should model how to update that. Even if it stays constant, scaffolding out how that can work with something that says "this is where the latest status is determined" would be helpful.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so for nvidia-dra-driver i was looking to use this as part of GPU health check status update. But we dint really see any value of showing this information as part of an allocated claim as it does not really change. Its more complex to show any real updates on an allocated claim. |
||||||||||||
| WithReason("GPUDeviceReady"). | ||||||||||||
| WithMessage("GPUDeviceAllocated"). | ||||||||||||
| WithLastTransitionTime(metav1.Now()) | ||||||||||||
|
|
||||||||||||
| return resourceapply.AllocatedDeviceStatus(). | ||||||||||||
| WithDevice(dn). | ||||||||||||
| WithDriver(res.Driver). | ||||||||||||
| WithPool(res.Pool). | ||||||||||||
| WithConditions(cond). | ||||||||||||
| WithData(data) | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For my own understanding, who or what generally consumes this data?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To me, this seems useful for monitoring and debugging when admin want to know pod to gpu mapping, which GPU is being used by a particular pod. right now this info is not readily available. |
||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| func (s *DeviceState) applyDeviceStatus(ctx context.Context, ns, name string, devices ...*resourceapply.AllocatedDeviceStatusApplyConfiguration) error { | ||||||||||||
| claim := resourceapply.ResourceClaim(name, ns). | ||||||||||||
| WithStatus(resourceapply.ResourceClaimStatus().WithDevices(devices...)) | ||||||||||||
|
|
||||||||||||
| opts := metav1.ApplyOptions{FieldManager: consts.DriverName, Force: true} | ||||||||||||
|
|
||||||||||||
| _, err := s.config.coreclient.ResourceV1().ResourceClaims(ns).ApplyStatus(ctx, claim, opts) | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there some lightweight verification we can add to the e2e tests? At least something to verify that something like the condition or one of the attributes is set on even one of the examples would make sure this doesn't totally break later.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure. |
||||||||||||
| return err | ||||||||||||
| } | ||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does
resneed to be a pointer, or can we save the extra syntax and theoretical nil-dereference by making this a plain value? Not a blocking issue, but I may propose refactoring this when I integrate with #129.