Skip to content

Commit 1e3d869

Browse files
committed
Add logic to autogenerate a DeviceClass from a MultiNodeEnvironment
1 parent bc880fc commit 1e3d869

File tree

2 files changed

+143
-6
lines changed

2 files changed

+143
-6
lines changed

cmd/nvidia-dra-controller/mnenv.go

Lines changed: 140 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ import (
4040

4141
const (
4242
multiNodeEnvironmentFinalizer = "gpu.nvidia.com/finalizer.multiNodeEnvironment"
43-
imexDeviceClass = "imex.nvidia.com"
4443
)
4544

4645
type WorkItem struct {
@@ -55,6 +54,7 @@ type MultiNodeEnvironmentManager struct {
5554
multiNodeEnvironmentInformer cache.SharedIndexInformer
5655
multiNodeEnvironmentLister nvlisters.MultiNodeEnvironmentLister
5756
resourceClaimLister resourcelisters.ResourceClaimLister
57+
deviceClassLister resourcelisters.DeviceClassLister
5858
}
5959

6060
// StartMultiNodeEnvironmentManager starts a MultiNodeEnvironmentManager.
@@ -70,11 +70,15 @@ func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*Mul
7070
rcInformer := coreInformerFactory.Resource().V1beta1().ResourceClaims().Informer()
7171
rcLister := resourcelisters.NewResourceClaimLister(rcInformer.GetIndexer())
7272

73+
dcInformer := coreInformerFactory.Resource().V1beta1().DeviceClasses().Informer()
74+
dcLister := resourcelisters.NewDeviceClassLister(dcInformer.GetIndexer())
75+
7376
m := &MultiNodeEnvironmentManager{
7477
clientsets: config.clientsets,
7578
multiNodeEnvironmentInformer: mneInformer,
7679
multiNodeEnvironmentLister: mneLister,
7780
resourceClaimLister: rcLister,
81+
deviceClassLister: dcLister,
7882
}
7983

8084
var err error
@@ -106,6 +110,14 @@ func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*Mul
106110
return nil, fmt.Errorf("error adding event handlers for ResourceClaim informer: %w", err)
107111
}
108112

113+
_, err = dcInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
114+
AddFunc: func(obj any) { queue.Enqueue(obj, m.onDeviceClassAddOrUpdate) },
115+
UpdateFunc: func(objOld, objNew any) { queue.Enqueue(objNew, m.onDeviceClassAddOrUpdate) },
116+
})
117+
if err != nil {
118+
return nil, fmt.Errorf("error adding event handlers for DeviceClass informer: %w", err)
119+
}
120+
109121
m.waitGroup.Add(3)
110122
go func() {
111123
defer m.waitGroup.Done()
@@ -120,10 +132,10 @@ func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*Mul
120132
queue.Run(ctx.Done())
121133
}()
122134

123-
if !cache.WaitForCacheSync(ctx.Done(), mneInformer.HasSynced, rcInformer.HasSynced) {
135+
if !cache.WaitForCacheSync(ctx.Done(), mneInformer.HasSynced, rcInformer.HasSynced, dcInformer.HasSynced) {
124136
klog.Warning("Cache sync failed; retrying in 5 seconds")
125137
time.Sleep(5 * time.Second)
126-
if !cache.WaitForCacheSync(ctx.Done(), mneInformer.HasSynced, rcInformer.HasSynced) {
138+
if !cache.WaitForCacheSync(ctx.Done(), mneInformer.HasSynced, rcInformer.HasSynced, dcInformer.HasSynced) {
127139
return nil, fmt.Errorf("informer cache sync failed twice")
128140
}
129141
}
@@ -160,13 +172,57 @@ func (m *MultiNodeEnvironmentManager) onMultiNodeEnvironmentAdd(obj any) error {
160172
Controller: ptr.To(true),
161173
}
162174

163-
if _, err := m.createResourceClaim(mne.Namespace, mne.Spec.ResourceClaimName, ownerReference); err != nil {
175+
dc, err := m.createDeviceClass("", ownerReference)
176+
if err != nil {
177+
return fmt.Errorf("error creating DeviceClass '%s': %w", "<generated-name>", err)
178+
}
179+
180+
if _, err := m.createResourceClaim(mne.Namespace, mne.Spec.ResourceClaimName, dc.Name, ownerReference); err != nil {
164181
return fmt.Errorf("error creating ResourceClaim '%s/%s': %w", mne.Namespace, mne.Spec.ResourceClaimName, err)
165182
}
166183

167184
return nil
168185
}
169186

187+
func (m *MultiNodeEnvironmentManager) onDeviceClassAddOrUpdate(obj any) error {
188+
dc, ok := obj.(*resourceapi.DeviceClass)
189+
if !ok {
190+
return fmt.Errorf("failed to cast to DeviceClass")
191+
}
192+
193+
klog.Infof("Processing added or updated DeviceClass: %s", dc.Name)
194+
195+
if len(dc.OwnerReferences) != 1 {
196+
return nil
197+
}
198+
199+
if dc.OwnerReferences[0].Kind != nvapi.MultiNodeEnvironmentKind {
200+
return nil
201+
}
202+
203+
if !cache.WaitForCacheSync(context.Background().Done(), m.multiNodeEnvironmentInformer.HasSynced) {
204+
return fmt.Errorf("cache sync failed for MultiNodeEnvironment")
205+
}
206+
207+
mnes, err := m.multiNodeEnvironmentInformer.GetIndexer().ByIndex("uid", string(dc.OwnerReferences[0].UID))
208+
if err != nil {
209+
return fmt.Errorf("error retrieving MultiNodeInformer OwnerReference by UID from indexer: %w", err)
210+
}
211+
if len(mnes) != 0 {
212+
return nil
213+
}
214+
215+
if err := m.removeDeviceClassFinalizer(dc.Name); err != nil {
216+
return fmt.Errorf("error removing finalizer on DeviceClass '%s': %w", dc.Name, err)
217+
}
218+
219+
if err := m.deleteDeviceClass(dc.Name); err != nil {
220+
return fmt.Errorf("error deleting DeviceClass '%s': %w", dc.Name, err)
221+
}
222+
223+
return nil
224+
}
225+
170226
func (m *MultiNodeEnvironmentManager) onResourceClaimAddOrUpdate(obj any) error {
171227
rc, ok := obj.(*resourceapi.ResourceClaim)
172228
if !ok {
@@ -206,7 +262,51 @@ func (m *MultiNodeEnvironmentManager) onResourceClaimAddOrUpdate(obj any) error
206262
return nil
207263
}
208264

209-
func (m *MultiNodeEnvironmentManager) createResourceClaim(namespace, name string, ownerReference metav1.OwnerReference) (*resourceapi.ResourceClaim, error) {
265+
func (m *MultiNodeEnvironmentManager) createDeviceClass(name string, ownerReference metav1.OwnerReference) (*resourceapi.DeviceClass, error) {
266+
if name != "" {
267+
dc, err := m.deviceClassLister.Get(name)
268+
if err == nil {
269+
if len(dc.OwnerReferences) != 1 && dc.OwnerReferences[0] != ownerReference {
270+
return nil, fmt.Errorf("DeviceClass '%s' exists without expected OwnerReference: %v", name, ownerReference)
271+
}
272+
return dc, nil
273+
}
274+
if !errors.IsNotFound(err) {
275+
return nil, fmt.Errorf("error retrieving DeviceClass: %w", err)
276+
}
277+
}
278+
279+
deviceClass := &resourceapi.DeviceClass{
280+
ObjectMeta: metav1.ObjectMeta{
281+
OwnerReferences: []metav1.OwnerReference{ownerReference},
282+
Finalizers: []string{multiNodeEnvironmentFinalizer},
283+
},
284+
Spec: resourceapi.DeviceClassSpec{
285+
Selectors: []resourceapi.DeviceSelector{
286+
{
287+
CEL: &resourceapi.CELDeviceSelector{
288+
Expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'imex-channel'",
289+
},
290+
},
291+
},
292+
},
293+
}
294+
295+
if name == "" {
296+
deviceClass.GenerateName = ownerReference.Name
297+
} else {
298+
deviceClass.Name = name
299+
}
300+
301+
dc, err := m.clientsets.Core.ResourceV1beta1().DeviceClasses().Create(context.Background(), deviceClass, metav1.CreateOptions{})
302+
if err != nil {
303+
return nil, fmt.Errorf("error creating DeviceClass: %w", err)
304+
}
305+
306+
return dc, nil
307+
}
308+
309+
func (m *MultiNodeEnvironmentManager) createResourceClaim(namespace, name, deviceClassName string, ownerReference metav1.OwnerReference) (*resourceapi.ResourceClaim, error) {
210310
rc, err := m.resourceClaimLister.ResourceClaims(namespace).Get(name)
211311
if err == nil {
212312
if len(rc.OwnerReferences) != 1 && rc.OwnerReferences[0] != ownerReference {
@@ -228,7 +328,7 @@ func (m *MultiNodeEnvironmentManager) createResourceClaim(namespace, name string
228328
Spec: resourceapi.ResourceClaimSpec{
229329
Devices: resourceapi.DeviceClaim{
230330
Requests: []resourceapi.DeviceRequest{{
231-
Name: "imex", DeviceClassName: imexDeviceClass,
331+
Name: "device", DeviceClassName: deviceClassName,
232332
}},
233333
},
234334
},
@@ -242,6 +342,32 @@ func (m *MultiNodeEnvironmentManager) createResourceClaim(namespace, name string
242342
return rc, nil
243343
}
244344

345+
func (m *MultiNodeEnvironmentManager) removeDeviceClassFinalizer(name string) error {
346+
dc, err := m.deviceClassLister.Get(name)
347+
if err != nil && errors.IsNotFound(err) {
348+
return nil
349+
}
350+
if err != nil {
351+
return fmt.Errorf("error retrieving DeviceClass: %w", err)
352+
}
353+
354+
newDC := dc.DeepCopy()
355+
356+
newDC.Finalizers = []string{}
357+
for _, f := range dc.Finalizers {
358+
if f != multiNodeEnvironmentFinalizer {
359+
newDC.Finalizers = append(newDC.Finalizers, f)
360+
}
361+
}
362+
363+
_, err = m.clientsets.Core.ResourceV1beta1().DeviceClasses().Update(context.Background(), newDC, metav1.UpdateOptions{})
364+
if err != nil {
365+
return fmt.Errorf("error updating DeviceClass: %w", err)
366+
}
367+
368+
return nil
369+
}
370+
245371
func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, name string) error {
246372
rc, err := m.resourceClaimLister.ResourceClaims(namespace).Get(name)
247373
if err != nil && errors.IsNotFound(err) {
@@ -268,6 +394,14 @@ func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, na
268394
return nil
269395
}
270396

397+
func (m *MultiNodeEnvironmentManager) deleteDeviceClass(name string) error {
398+
err := m.clientsets.Core.ResourceV1beta1().DeviceClasses().Delete(context.Background(), name, metav1.DeleteOptions{})
399+
if err != nil && !errors.IsNotFound(err) {
400+
return fmt.Errorf("erroring deleting DeviceClass: %w", err)
401+
}
402+
return nil
403+
}
404+
271405
func (m *MultiNodeEnvironmentManager) deleteResourceClaim(namespace, name string) error {
272406
err := m.clientsets.Core.ResourceV1beta1().ResourceClaims(namespace).Delete(context.Background(), name, metav1.DeleteOptions{})
273407
if err != nil && !errors.IsNotFound(err) {

deployments/helm/k8s-dra-driver/templates/clusterrole.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ rules:
1111
- apiGroups: ["resource.k8s.io"]
1212
resources: ["resourceclaims"]
1313
verbs: ["get", "list", "watch", "create", "update", "delete"]
14+
- apiGroups: ["resource.k8s.io"]
15+
resources: ["deviceclasses"]
16+
verbs: ["get", "list", "watch", "create", "update", "delete"]
1417
- apiGroups: ["resource.k8s.io"]
1518
resources: ["resourceclaims/status"]
1619
verbs: ["update"]

0 commit comments

Comments
 (0)