@@ -40,7 +40,6 @@ import (
4040
4141const (
4242 multiNodeEnvironmentFinalizer = "gpu.nvidia.com/finalizer.multiNodeEnvironment"
43- imexDeviceClass = "imex.nvidia.com"
4443)
4544
4645type WorkItem struct {
@@ -55,6 +54,7 @@ type MultiNodeEnvironmentManager struct {
5554 multiNodeEnvironmentInformer cache.SharedIndexInformer
5655 multiNodeEnvironmentLister nvlisters.MultiNodeEnvironmentLister
5756 resourceClaimLister resourcelisters.ResourceClaimLister
57+ deviceClassLister resourcelisters.DeviceClassLister
5858}
5959
6060// StartManager starts a MultiNodeEnvironmentManager.
@@ -70,11 +70,15 @@ func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*Mul
7070 rcInformer := coreInformerFactory .Resource ().V1beta1 ().ResourceClaims ().Informer ()
7171 rcLister := resourcelisters .NewResourceClaimLister (rcInformer .GetIndexer ())
7272
73+ dcInformer := coreInformerFactory .Resource ().V1beta1 ().DeviceClasses ().Informer ()
74+ dcLister := resourcelisters .NewDeviceClassLister (dcInformer .GetIndexer ())
75+
7376 m := & MultiNodeEnvironmentManager {
7477 clientsets : config .clientsets ,
7578 multiNodeEnvironmentInformer : mneInformer ,
7679 multiNodeEnvironmentLister : mneLister ,
7780 resourceClaimLister : rcLister ,
81+ deviceClassLister : dcLister ,
7882 }
7983
8084 var err error
@@ -106,6 +110,14 @@ func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*Mul
106110 return nil , fmt .Errorf ("error adding event handlers for ResourceClaim informer: %w" , err )
107111 }
108112
113+ _ , err = dcInformer .AddEventHandler (cache.ResourceEventHandlerFuncs {
114+ AddFunc : func (obj any ) { queue .Enqueue (obj , m .onDeviceClassAddOrUpdate ) },
115+ UpdateFunc : func (objOld , objNew any ) { queue .Enqueue (objNew , m .onDeviceClassAddOrUpdate ) },
116+ })
117+ if err != nil {
118+ return nil , fmt .Errorf ("error adding event handlers for DeviceClass informer: %w" , err )
119+ }
120+
109121 m .waitGroup .Add (3 )
110122 go func () {
111123 defer m .waitGroup .Done ()
@@ -120,10 +132,10 @@ func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*Mul
120132 queue .Run (ctx .Done ())
121133 }()
122134
123- if ! cache .WaitForCacheSync (ctx .Done (), mneInformer .HasSynced , rcInformer .HasSynced ) {
135+ if ! cache .WaitForCacheSync (ctx .Done (), mneInformer .HasSynced , rcInformer .HasSynced , dcInformer . HasSynced ) {
124136 klog .Warning ("Cache sync failed; retrying in 5 seconds" )
125137 time .Sleep (5 * time .Second )
126- if ! cache .WaitForCacheSync (ctx .Done (), mneInformer .HasSynced , rcInformer .HasSynced ) {
138+ if ! cache .WaitForCacheSync (ctx .Done (), mneInformer .HasSynced , rcInformer .HasSynced , dcInformer . HasSynced ) {
127139 return nil , fmt .Errorf ("informer cache sync failed twice" )
128140 }
129141 }
@@ -160,13 +172,57 @@ func (m *MultiNodeEnvironmentManager) onMultiNodeEnvironmentAdd(obj any) error {
160172 Controller : ptr .To (true ),
161173 }
162174
163- if _ , err := m .createResourceClaim (mne .Namespace , mne .Spec .ResourceClaimName , ownerReference ); err != nil {
175+ dc , err := m .createDeviceClass ("" , ownerReference )
176+ if err != nil {
177+ return fmt .Errorf ("error creating DeviceClass '%s': %w" , "<generated-name>" , err )
178+ }
179+
180+ if _ , err := m .createResourceClaim (mne .Namespace , mne .Spec .ResourceClaimName , dc .Name , ownerReference ); err != nil {
164181 return fmt .Errorf ("error creating ResourceClaim '%s/%s': %w" , mne .Namespace , mne .Spec .ResourceClaimName , err )
165182 }
166183
167184 return nil
168185}
169186
187+ func (m * MultiNodeEnvironmentManager ) onDeviceClassAddOrUpdate (obj any ) error {
188+ dc , ok := obj .(* resourceapi.DeviceClass )
189+ if ! ok {
190+ return fmt .Errorf ("failed to cast to DeviceClass" )
191+ }
192+
193+ klog .Infof ("Processing added or updated DeviceClass: %s" , dc .Name )
194+
195+ if len (dc .OwnerReferences ) != 1 {
196+ return nil
197+ }
198+
199+ if dc .OwnerReferences [0 ].Kind != nvapi .MultiNodeEnvironmentKind {
200+ return nil
201+ }
202+
203+ if ! cache .WaitForCacheSync (context .Background ().Done (), m .multiNodeEnvironmentInformer .HasSynced ) {
204+ return fmt .Errorf ("cache sync failed for MultiNodeEnvironment" )
205+ }
206+
207+ mnes , err := m .multiNodeEnvironmentInformer .GetIndexer ().ByIndex ("uid" , string (dc .OwnerReferences [0 ].UID ))
208+ if err != nil {
209+ return fmt .Errorf ("error retrieving MultiNodeInformer OwnerReference by UID from indexer: %w" , err )
210+ }
211+ if len (mnes ) != 0 {
212+ return nil
213+ }
214+
215+ if err := m .removeDeviceClassFinalizer (dc .Name ); err != nil {
216+ return fmt .Errorf ("error removing finalizer on DeviceClass '%s': %w" , dc .Name , err )
217+ }
218+
219+ if err := m .deleteDeviceClass (dc .Name ); err != nil {
220+ return fmt .Errorf ("error deleting DeviceClass '%s': %w" , dc .Name , err )
221+ }
222+
223+ return nil
224+ }
225+
170226func (m * MultiNodeEnvironmentManager ) onResourceClaimAddOrUpdate (obj any ) error {
171227 rc , ok := obj .(* resourceapi.ResourceClaim )
172228 if ! ok {
@@ -206,7 +262,51 @@ func (m *MultiNodeEnvironmentManager) onResourceClaimAddOrUpdate(obj any) error
206262 return nil
207263}
208264
209- func (m * MultiNodeEnvironmentManager ) createResourceClaim (namespace , name string , ownerReference metav1.OwnerReference ) (* resourceapi.ResourceClaim , error ) {
265+ func (m * MultiNodeEnvironmentManager ) createDeviceClass (name string , ownerReference metav1.OwnerReference ) (* resourceapi.DeviceClass , error ) {
266+ if name != "" {
267+ dc , err := m .deviceClassLister .Get (name )
268+ if err == nil {
269+ if len (dc .OwnerReferences ) != 1 && dc .OwnerReferences [0 ] != ownerReference {
270+ return nil , fmt .Errorf ("DeviceClass '%s' exists without expected OwnerReference: %v" , name , ownerReference )
271+ }
272+ return dc , nil
273+ }
274+ if ! errors .IsNotFound (err ) {
275+ return nil , fmt .Errorf ("error retrieving DeviceClass: %w" , err )
276+ }
277+ }
278+
279+ deviceClass := & resourceapi.DeviceClass {
280+ ObjectMeta : metav1.ObjectMeta {
281+ OwnerReferences : []metav1.OwnerReference {ownerReference },
282+ Finalizers : []string {multiNodeEnvironmentFinalizer },
283+ },
284+ Spec : resourceapi.DeviceClassSpec {
285+ Selectors : []resourceapi.DeviceSelector {
286+ {
287+ CEL : & resourceapi.CELDeviceSelector {
288+ Expression : "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'imex-channel'" ,
289+ },
290+ },
291+ },
292+ },
293+ }
294+
295+ if name == "" {
296+ deviceClass .GenerateName = ownerReference .Name
297+ } else {
298+ deviceClass .Name = name
299+ }
300+
301+ dc , err := m .clientsets .Core .ResourceV1beta1 ().DeviceClasses ().Create (context .Background (), deviceClass , metav1.CreateOptions {})
302+ if err != nil {
303+ return nil , fmt .Errorf ("error creating DeviceClass: %w" , err )
304+ }
305+
306+ return dc , nil
307+ }
308+
309+ func (m * MultiNodeEnvironmentManager ) createResourceClaim (namespace , name , deviceClassName string , ownerReference metav1.OwnerReference ) (* resourceapi.ResourceClaim , error ) {
210310 rc , err := m .resourceClaimLister .ResourceClaims (namespace ).Get (name )
211311 if err == nil {
212312 if len (rc .OwnerReferences ) != 1 && rc .OwnerReferences [0 ] != ownerReference {
@@ -228,7 +328,7 @@ func (m *MultiNodeEnvironmentManager) createResourceClaim(namespace, name string
228328 Spec : resourceapi.ResourceClaimSpec {
229329 Devices : resourceapi.DeviceClaim {
230330 Requests : []resourceapi.DeviceRequest {{
231- Name : "imex " , DeviceClassName : imexDeviceClass ,
331+ Name : "device " , DeviceClassName : deviceClassName ,
232332 }},
233333 },
234334 },
@@ -242,6 +342,32 @@ func (m *MultiNodeEnvironmentManager) createResourceClaim(namespace, name string
242342 return rc , nil
243343}
244344
345+ func (m * MultiNodeEnvironmentManager ) removeDeviceClassFinalizer (name string ) error {
346+ dc , err := m .deviceClassLister .Get (name )
347+ if err != nil && errors .IsNotFound (err ) {
348+ return nil
349+ }
350+ if err != nil {
351+ return fmt .Errorf ("error retrieving DeviceClass: %w" , err )
352+ }
353+
354+ newDC := dc .DeepCopy ()
355+
356+ newDC .Finalizers = []string {}
357+ for _ , f := range dc .Finalizers {
358+ if f != multiNodeEnvironmentFinalizer {
359+ newDC .Finalizers = append (newDC .Finalizers , f )
360+ }
361+ }
362+
363+ _ , err = m .clientsets .Core .ResourceV1beta1 ().DeviceClasses ().Update (context .Background (), newDC , metav1.UpdateOptions {})
364+ if err != nil {
365+ return fmt .Errorf ("error updating DeviceClass: %w" , err )
366+ }
367+
368+ return nil
369+ }
370+
245371func (m * MultiNodeEnvironmentManager ) removeResourceClaimFinalizer (namespace , name string ) error {
246372 rc , err := m .resourceClaimLister .ResourceClaims (namespace ).Get (name )
247373 if err != nil && errors .IsNotFound (err ) {
@@ -268,6 +394,14 @@ func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, na
268394 return nil
269395}
270396
397+ func (m * MultiNodeEnvironmentManager ) deleteDeviceClass (name string ) error {
398+ err := m .clientsets .Core .ResourceV1beta1 ().DeviceClasses ().Delete (context .Background (), name , metav1.DeleteOptions {})
399+ if err != nil && ! errors .IsNotFound (err ) {
400+ return fmt .Errorf ("erroring deleting DeviceClass: %w" , err )
401+ }
402+ return nil
403+ }
404+
271405func (m * MultiNodeEnvironmentManager ) deleteResourceClaim (namespace , name string ) error {
272406 err := m .clientsets .Core .ResourceV1beta1 ().ResourceClaims (namespace ).Delete (context .Background (), name , metav1.DeleteOptions {})
273407 if err != nil && ! errors .IsNotFound (err ) {
0 commit comments