Skip to content

Commit fb325f3

Browse files
committed
Add support for multiple channels within a single ComputeDomain
Signed-off-by: Kevin Klues <[email protected]>
1 parent 63b5999 commit fb325f3

File tree

12 files changed

+105
-78
lines changed

12 files changed

+105
-78
lines changed

api/nvidia.com/resource/v1beta1/computedomain.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,13 @@ type ComputeDomainList struct {
5151
}
5252

5353
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="A computeDomain.spec is immutable"
54+
// +kubebuilder:validation:XValidation:rule="size(self.resourceClaimTemplates) >= 1",message="The 'resourceClaimTemplates' field must have at least one entry."
55+
// +kubebuilder:validation:XValidation:rule="size(self.resourceClaimTemplates) < 64",message="The 'resourceClaimTemplates' field must have less than 64 entries."
5456

5557
// ComputeDomainSpec provides the spec for a ComputeDomain.
5658
type ComputeDomainSpec struct {
57-
NumNodes int `json:"numNodes"`
58-
ResourceClaimTemplate ComputeDomainResourceClaimTemplate `json:"resourceClaimTemplate"`
59+
NumNodes int `json:"numNodes"`
60+
ResourceClaimTemplates []ComputeDomainResourceClaimTemplate `json:"resourceClaimTemplates"`
5961
}
6062

6163
// ComputeDomainResourceClaimTemplate provides the details of the ResourceClaimTemplate to generate.

api/nvidia.com/resource/v1beta1/zz_generated.deepcopy.go

Lines changed: 6 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/compute-domain-controller/computedomain.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,8 @@ const (
3838
computeDomainLabelKey = "resource.nvidia.com/computeDomain"
3939
computeDomainFinalizer = computeDomainLabelKey
4040

41-
computeDomainDefaultChannelDeviceClass = "compute-domain-default-channel.nvidia.com"
42-
computeDomainChannelDeviceClass = "compute-domain-channel.nvidia.com"
43-
computeDomainDaemonDeviceClass = "compute-domain-daemon.nvidia.com"
41+
computeDomainChannelDeviceClass = "compute-domain-channel.nvidia.com"
42+
computeDomainDaemonDeviceClass = "compute-domain-daemon.nvidia.com"
4443

4544
computeDomainResourceClaimTemplateTargetLabelKey = "resource.nvidia.com/computeDomainTarget"
4645
computeDomainResourceClaimTemplateTargetDaemon = "Daemon"
@@ -289,8 +288,10 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
289288
return fmt.Errorf("error creating DaemonSet: %w", err)
290289
}
291290

292-
if _, err := m.resourceClaimTemplateManager.Create(ctx, cd.Namespace, cd.Spec.ResourceClaimTemplate.Name, cd); err != nil {
293-
return fmt.Errorf("error creating ResourceClaimTemplate '%s/%s': %w", cd.Namespace, cd.Spec.ResourceClaimTemplate.Name, err)
291+
for i, rct := range cd.Spec.ResourceClaimTemplates {
292+
if _, err := m.resourceClaimTemplateManager.Create(ctx, cd.Namespace, rct.Name, i, cd); err != nil {
293+
return fmt.Errorf("error creating ResourceClaimTemplate '%s/%s': %w", cd.Namespace, rct.Name, err)
294+
}
294295
}
295296

296297
return nil

cmd/compute-domain-controller/resourceclaimtemplate.go

Lines changed: 32 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ type ResourceClaimTemplateTemplateData struct {
5353
TargetLabelValue string
5454
DeviceClassName string
5555
DriverName string
56+
ChannelID int
5657
ChannelConfig *nvapi.ComputeDomainChannelConfig
5758
DaemonConfig *nvapi.ComputeDomainDaemonConfig
5859
}
@@ -166,22 +167,16 @@ func (m *BaseResourceClaimTemplateManager) Delete(ctx context.Context, cdUID str
166167
if err != nil {
167168
return fmt.Errorf("error retrieving ResourceClaimTemplate: %w", err)
168169
}
169-
if len(rcts) > 1 {
170-
return fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
171-
}
172-
if len(rcts) == 0 {
173-
return nil
174-
}
175-
176-
rct := rcts[0]
177170

178-
if rct.GetDeletionTimestamp() != nil {
179-
return nil
180-
}
171+
for _, rct := range rcts {
172+
if rct.GetDeletionTimestamp() != nil {
173+
continue
174+
}
181175

182-
err = m.config.clientsets.Core.ResourceV1beta1().ResourceClaimTemplates(rct.Namespace).Delete(ctx, rct.Name, metav1.DeleteOptions{})
183-
if err != nil && !errors.IsNotFound(err) {
184-
return fmt.Errorf("erroring deleting ResourceClaimTemplate: %w", err)
176+
err := m.config.clientsets.Core.ResourceV1beta1().ResourceClaimTemplates(rct.Namespace).Delete(ctx, rct.Name, metav1.DeleteOptions{})
177+
if err != nil && !errors.IsNotFound(err) {
178+
return fmt.Errorf("erroring deleting ResourceClaimTemplate: %w", err)
179+
}
185180
}
186181

187182
return nil
@@ -192,32 +187,26 @@ func (m *BaseResourceClaimTemplateManager) RemoveFinalizer(ctx context.Context,
192187
if err != nil {
193188
return fmt.Errorf("error retrieving ResourceClaimTemplate: %w", err)
194189
}
195-
if len(rcts) > 1 {
196-
return fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
197-
}
198-
if len(rcts) == 0 {
199-
return nil
200-
}
201-
202-
rct := rcts[0]
203190

204-
if rct.GetDeletionTimestamp() == nil {
205-
return fmt.Errorf("attempting to remove finalizer before ResourceClaimTemplate marked for deletion")
206-
}
191+
for _, rct := range rcts {
192+
if rct.GetDeletionTimestamp() == nil {
193+
return fmt.Errorf("attempting to remove finalizer before ResourceClaimTemplate marked for deletion")
194+
}
207195

208-
newRCT := rct.DeepCopy()
209-
newRCT.Finalizers = []string{}
210-
for _, f := range rct.Finalizers {
211-
if f != computeDomainFinalizer {
212-
newRCT.Finalizers = append(newRCT.Finalizers, f)
196+
newRCT := rct.DeepCopy()
197+
newRCT.Finalizers = []string{}
198+
for _, f := range rct.Finalizers {
199+
if f != computeDomainFinalizer {
200+
newRCT.Finalizers = append(newRCT.Finalizers, f)
201+
}
202+
}
203+
if len(rct.Finalizers) == len(newRCT.Finalizers) {
204+
return nil
213205
}
214-
}
215-
if len(rct.Finalizers) == len(newRCT.Finalizers) {
216-
return nil
217-
}
218206

219-
if _, err = m.config.clientsets.Core.ResourceV1beta1().ResourceClaimTemplates(rct.Namespace).Update(ctx, newRCT, metav1.UpdateOptions{}); err != nil {
220-
return fmt.Errorf("error updating ResourceClaimTemplate: %w", err)
207+
if _, err = m.config.clientsets.Core.ResourceV1beta1().ResourceClaimTemplates(newRCT.Namespace).Update(ctx, newRCT, metav1.UpdateOptions{}); err != nil {
208+
return fmt.Errorf("error updating ResourceClaimTemplate: %w", err)
209+
}
221210
}
222211

223212
return nil
@@ -274,9 +263,6 @@ func (m *DaemonSetResourceClaimTemplateManager) Create(ctx context.Context, name
274263
daemonConfig.NumNodes = cd.Spec.NumNodes
275264
daemonConfig.DomainID = string(cd.UID)
276265

277-
channelConfig := nvapi.DefaultComputeDomainChannelConfig()
278-
channelConfig.DomainID = string(cd.UID)
279-
280266
templateData := ResourceClaimTemplateTemplateData{
281267
Namespace: namespace,
282268
GenerateName: fmt.Sprintf("%s-daemon-claim-template-", cd.Name),
@@ -322,16 +308,15 @@ func NewWorkloadResourceClaimTemplateManager(config *ManagerConfig) *WorkloadRes
322308
return m
323309
}
324310

325-
func (m *WorkloadResourceClaimTemplateManager) Create(ctx context.Context, namespace, name string, cd *nvapi.ComputeDomain) (*resourceapi.ResourceClaimTemplate, error) {
311+
func (m *WorkloadResourceClaimTemplateManager) Create(ctx context.Context, namespace, name string, channel int, cd *nvapi.ComputeDomain) (*resourceapi.ResourceClaimTemplate, error) {
326312
rcts, err := getByComputeDomainUID[*resourceapi.ResourceClaimTemplate](ctx, m.informer, string(cd.UID))
327313
if err != nil {
328314
return nil, fmt.Errorf("error retrieving ResourceClaimTemplate: %w", err)
329315
}
330-
if len(rcts) > 1 {
331-
return nil, fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
332-
}
333-
if len(rcts) == 1 {
334-
return rcts[0], nil
316+
for _, rct := range rcts {
317+
if rct.Namespace == namespace && rct.Name == name {
318+
return rct, nil
319+
}
335320
}
336321

337322
channelConfig := nvapi.DefaultComputeDomainChannelConfig()
@@ -345,8 +330,9 @@ func (m *WorkloadResourceClaimTemplateManager) Create(ctx context.Context, names
345330
ComputeDomainLabelValue: cd.UID,
346331
TargetLabelKey: computeDomainResourceClaimTemplateTargetLabelKey,
347332
TargetLabelValue: computeDomainResourceClaimTemplateTargetWorkload,
348-
DeviceClassName: computeDomainDefaultChannelDeviceClass,
333+
DeviceClassName: computeDomainChannelDeviceClass,
349334
DriverName: DriverName,
335+
ChannelID: channel,
350336
ChannelConfig: channelConfig,
351337
}
352338

cmd/compute-domain-kubelet-plugin/computedomain.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,10 @@ func (m *ComputeDomainManager) AddNodeLabel(ctx context.Context, cdUID string) e
288288
}
289289

290290
currentValue, exists := node.Labels[computeDomainLabelKey]
291+
if exists && currentValue != cdUID {
292+
return fmt.Errorf("label already exists for a different ComputeDomain")
293+
}
294+
291295
if exists && currentValue == cdUID {
292296
return nil
293297
}

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ func (s *DeviceState) Unprepare(ctx context.Context, claimUID string) error {
181181
return nil
182182
}
183183

184-
if err := s.unprepareDevices(ctx, claimUID, preparedClaims[claimUID]); err != nil {
184+
if err := s.unprepareDevices(ctx, claimUID, preparedClaims); err != nil {
185185
return fmt.Errorf("unprepare devices failed: %w", err)
186186
}
187187

@@ -339,17 +339,20 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
339339
return preparedDevices, nil
340340
}
341341

342-
func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, devices PreparedDevices) error {
342+
func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, preparedClaims PreparedClaims) error {
343343
// Unprepare any ComputeDomain daemons prepared for each group of prepared devices.
344-
for _, group := range devices {
345-
// If a cannel type, remove the ComputeDomain label from the node
344+
for _, group := range preparedClaims[claimUID] {
345+
// If this is the last remaining channel, remove the ComputeDomain label from the node
346346
if group.ConfigState.Type == ComputeDomainChannelType {
347+
if len(preparedClaims.ComputeDomainChannels()) > 1 {
348+
return nil
349+
}
347350
if err := s.computeDomainManager.RemoveNodeLabel(ctx, group.ConfigState.ComputeDomain); err != nil {
348351
return fmt.Errorf("error removing Node label for ComputeDomain: %w", err)
349352
}
350353
}
351354

352-
// If a daemon type, unprepare the new ComputeDomain daemon.
355+
// If a daemon type, unprepare the new ComputeDomain daemon
353356
if group.ConfigState.Type == ComputeDomainDaemonType {
354357
computeDomainDaemonSettings := s.computeDomainManager.NewSettings(group.ConfigState.ComputeDomain)
355358
if err := computeDomainDaemonSettings.Unprepare(ctx); err != nil {

cmd/compute-domain-kubelet-plugin/driver.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,6 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
6868
// Enumerate the set of ComputeDomain daemon devices and publish them
6969
var resources kubeletplugin.Resources
7070
for _, device := range state.allocatable {
71-
// Explicitly exclude ComputeDomain channels from being advertised here. They
72-
// are instead advertised in as a network resource from the control plane.
73-
if device.Type() == ComputeDomainChannelType && device.Channel.ID != 0 {
74-
continue
75-
}
7671
resources.Devices = append(resources.Devices, device.GetDevice())
7772
}
7873

cmd/compute-domain-kubelet-plugin/nvlib.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,10 @@ func (l deviceLib) enumerateComputeDomainDaemons(config *Config) (AllocatableDev
125125

126126
func (l deviceLib) getImexChannelCount() (int, error) {
127127
// TODO: Pull this value from /proc/driver/nvidia/params
128-
return 2048, nil
128+
// The default is 2048.
129+
// For now limit this to 64 (which is half the maximum number of devices
130+
// allowed in a ResourceSlice)
131+
return 64, nil
129132
}
130133

131134
func (l deviceLib) getImexChannelMajor() (int, error) {

cmd/compute-domain-kubelet-plugin/prepared.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,26 @@ func (l PreparedDeviceList) ComputeDomainDaemons() PreparedDeviceList {
9494
return devices
9595
}
9696

97+
func (c PreparedClaims) ComputeDomainChannels() PreparedDeviceList {
98+
var ds PreparedDeviceList
99+
for _, devices := range c {
100+
for _, group := range devices {
101+
ds = append(ds, group.Devices.ComputeDomainChannels()...)
102+
}
103+
}
104+
return ds
105+
}
106+
107+
func (c PreparedClaims) ComputeDomainDaemons() PreparedDeviceList {
108+
var ds PreparedDeviceList
109+
for _, devices := range c {
110+
for _, group := range devices {
111+
ds = append(ds, group.Devices.ComputeDomainDaemons()...)
112+
}
113+
}
114+
return ds
115+
}
116+
97117
func (d PreparedDevices) GetDevices() []*drapbv1.Device {
98118
var devices []*drapbv1.Device
99119
for _, group := range d {

deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,22 +42,28 @@ spec:
4242
properties:
4343
numNodes:
4444
type: integer
45-
resourceClaimTemplate:
46-
description: ComputeDomainResourceClaimTemplate provides the details
47-
of the ResourceClaimTemplate to generate.
48-
properties:
49-
name:
50-
type: string
51-
required:
52-
- name
53-
type: object
45+
resourceClaimTemplates:
46+
items:
47+
description: ComputeDomainResourceClaimTemplate provides the details
48+
of the ResourceClaimTemplate to generate.
49+
properties:
50+
name:
51+
type: string
52+
required:
53+
- name
54+
type: object
55+
type: array
5456
required:
5557
- numNodes
56-
- resourceClaimTemplate
58+
- resourceClaimTemplates
5759
type: object
5860
x-kubernetes-validations:
5961
- message: A computeDomain.spec is immutable
6062
rule: self == oldSelf
63+
- message: The 'resourceClaimTemplates' field must have at least one entry.
64+
rule: size(self.resourceClaimTemplates) >= 1
65+
- message: The 'resourceClaimTemplates' field must have less than 64 entries.
66+
rule: size(self.resourceClaimTemplates) < 64
6167
status:
6268
description: ComputeDomainStatus provides the status for a ComputeDomain.
6369
properties:

0 commit comments

Comments
 (0)