Skip to content

Commit b80afe7

Browse files
Merge pull request #187 from ArangoGutierrez/107/132
Ensure each imex domain.cliqueId has a unique set of channel numbers
2 parents 6f74ac8 + 252bb50 commit b80afe7

File tree

21 files changed

+387
-123
lines changed

21 files changed

+387
-123
lines changed

cmd/nvidia-dra-controller/imex.go

Lines changed: 83 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package main
1919
import (
2020
"context"
2121
"fmt"
22+
"strings"
2223
"sync"
2324
"time"
2425

@@ -36,17 +37,20 @@ import (
3637
)
3738

3839
const (
39-
DriverName = "gpu.nvidia.com"
40-
ImexDomainLabel = "nvidia.com/gpu.imex-domain"
41-
ImexChannelLimit = 128
40+
DriverName = "gpu.nvidia.com"
41+
ImexDomainLabel = "nvidia.com/gpu.imex-domain"
42+
ResourceSliceImexChannelLimit = 128
43+
DriverImexChannelLimit = 2048
4244
)
4345

4446
// ImexManager carries the state shared by the IMEX manager's methods: a
// Kubernetes client and a WaitGroup for its background goroutines.
type ImexManager struct {
4547
// waitGroup counts background goroutines; manageResourceSlices calls Add(1)
// before starting its goroutine, which calls Done when it exits.
waitGroup sync.WaitGroup
4648
// clientset is the Kubernetes client passed to resourceslice.StartController.
clientset kubernetes.Interface
4749
}
4850

49-
type DriverResources resourceslice.DriverResources
51+
// imexDomainOffsets represents the offset for assigning IMEX channels
52+
// to ResourceSlices for each <imex-domain, cliqueid> combination.
53+
type imexDomainOffsets map[string]map[string]int
5054

5155
func StartIMEXManager(ctx context.Context, config *Config) (*ImexManager, error) {
5256
// Build a client set config
@@ -99,29 +103,34 @@ func StartIMEXManager(ctx context.Context, config *Config) (*ImexManager, error)
99103

100104
// manageResourceSlices reacts to added and removed IMEX domains and triggers the creation / removal of resource slices accordingly.
101105
func (m *ImexManager) manageResourceSlices(ctx context.Context, owner resourceslice.Owner, addedDomainsCh <-chan string, removedDomainsCh <-chan string) error {
102-
driverResources := resourceslice.DriverResources{}
103-
controller, err := resourceslice.StartController(ctx, m.clientset, DriverName, owner, &driverResources)
106+
driverResources := &resourceslice.DriverResources{}
107+
controller, err := resourceslice.StartController(ctx, m.clientset, DriverName, owner, driverResources)
104108
if err != nil {
105109
return fmt.Errorf("error starting resource slice controller: %w", err)
106110
}
107111

112+
imexDomainOffsets := new(imexDomainOffsets)
108113
m.waitGroup.Add(1)
109114
go func() {
110115
defer m.waitGroup.Done()
111116
for {
112117
select {
113118
case addedDomain := <-addedDomainsCh:
119+
offset, err := imexDomainOffsets.add(addedDomain, ResourceSliceImexChannelLimit, DriverImexChannelLimit)
120+
if err != nil {
121+
klog.Errorf("Error calculating channel offset for IMEX domain %s: %v", addedDomain, err)
122+
return
123+
}
114124
klog.Infof("Adding channels for new IMEX domain: %v", addedDomain)
115-
newDriverResources := DriverResources(driverResources).DeepCopy()
116-
newDriverResources.Pools[addedDomain] = generateImexChannelPool(addedDomain, ImexChannelLimit)
117-
controller.Update(&newDriverResources)
118-
driverResources = newDriverResources
125+
driverResources := driverResources.DeepCopy()
126+
driverResources.Pools[addedDomain] = generateImexChannelPool(addedDomain, offset, ResourceSliceImexChannelLimit)
127+
controller.Update(driverResources)
119128
case removedDomain := <-removedDomainsCh:
120129
klog.Infof("Removing channels for removed IMEX domain: %v", removedDomain)
121-
newDriverResources := DriverResources(driverResources).DeepCopy()
122-
delete(newDriverResources.Pools, removedDomain)
123-
controller.Update(&newDriverResources)
124-
driverResources = newDriverResources
130+
driverResources := driverResources.DeepCopy()
131+
delete(driverResources.Pools, removedDomain)
132+
imexDomainOffsets.remove(removedDomain)
133+
controller.Update(driverResources)
125134
case <-ctx.Done():
126135
return
127136
}
@@ -146,17 +155,6 @@ func (m *ImexManager) Stop() error {
146155
return nil
147156
}
148157

149-
// DeepCopy will perform a deep copy of the provided DriverResources.
150-
func (d DriverResources) DeepCopy() resourceslice.DriverResources {
151-
driverResources := resourceslice.DriverResources{
152-
Pools: make(map[string]resourceslice.Pool),
153-
}
154-
for p := range d.Pools {
155-
driverResources.Pools[p] = generateImexChannelPool(p, ImexChannelLimit)
156-
}
157-
return driverResources
158-
}
159-
160158
// streamImexDomains returns two channels that stream the IMEX domains that are added to and removed from nodes over time.
161159
func (m *ImexManager) streamImexDomains(ctx context.Context) (<-chan string, <-chan string, error) {
162160
// Create channels to stream IMEX domain ids that are added / removed
@@ -249,10 +247,10 @@ func (m *ImexManager) streamImexDomains(ctx context.Context) (<-chan string, <-c
249247
}
250248

251249
// generateImexChannelPool generates the contents of a ResourceSlice pool for a given IMEX domain.
252-
func generateImexChannelPool(imexDomain string, numChannels int) resourceslice.Pool {
253-
// Generate dchannels from 0 to numChannels
250+
func generateImexChannelPool(imexDomain string, startChannel int, numChannels int) resourceslice.Pool {
251+
// Generate channels from startChannel up to (but not including) startChannel+numChannels
254252
var devices []resourceapi.Device
255-
for i := 0; i < numChannels; i++ {
253+
for i := startChannel; i < (startChannel + numChannels); i++ {
256254
d := resourceapi.Device{
257255
Name: fmt.Sprintf("imex-channel-%d", i),
258256
Basic: &resourceapi.BasicDevice{
@@ -312,3 +310,60 @@ func (m *ImexManager) cleanupResourceSlices() error {
312310

313311
return nil
314312
}
313+
314+
// add sets the offset where an IMEX domain's channels should start counting from.
315+
func (offsets imexDomainOffsets) add(imexDomain string, resourceSliceImexChannelLimit, driverImexChannelLimit int) (int, error) {
316+
// Split the incoming imexDomain to split off its cliqueID
317+
id := strings.SplitN(imexDomain, ".", 2)
318+
if len(id) != 2 {
319+
return -1, fmt.Errorf("error adding IMEX domain %s: invalid format", imexDomain)
320+
}
321+
imexDomain = id[0]
322+
cliqueID := id[1]
323+
324+
// Check if the IMEX domain is already in the map
325+
if _, ok := offsets[imexDomain]; !ok {
326+
offsets[imexDomain] = make(map[string]int)
327+
}
328+
329+
// Return early if the clique is already in the map
330+
if offset, exists := offsets[imexDomain][cliqueID]; exists {
331+
return offset, nil
332+
}
333+
334+
// Track used offsets for the current imexDomain
335+
usedOffsets := make(map[int]struct{})
336+
for _, v := range offsets[imexDomain] {
337+
usedOffsets[v] = struct{}{}
338+
}
339+
340+
// Look for the first unused offset, stepping by resourceSliceImexChannelLimit
341+
var offset int
342+
for offset = 0; offset < driverImexChannelLimit; offset += resourceSliceImexChannelLimit {
343+
if _, exists := usedOffsets[offset]; !exists {
344+
break
345+
}
346+
}
347+
348+
// If we reach the limit, return an error
349+
if offset == driverImexChannelLimit {
350+
return -1, fmt.Errorf("error adding IMEX domain %s: channel limit reached", imexDomain)
351+
}
352+
offsets[imexDomain][cliqueID] = offset
353+
354+
return offset, nil
355+
}
356+
357+
func (offsets imexDomainOffsets) remove(imexDomain string) {
358+
id := strings.SplitN(imexDomain, ".", 2)
359+
if len(id) != 2 {
360+
return
361+
}
362+
imexDomain = id[0]
363+
cliqueID := id[1]
364+
365+
delete(offsets[imexDomain], cliqueID)
366+
if len(offsets[imexDomain]) == 0 {
367+
delete(offsets, imexDomain)
368+
}
369+
}

go.mod

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ module github.com/NVIDIA/k8s-dra-driver
22

33
go 1.23.1
44

5-
replace k8s.io/dynamic-resource-allocation => github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241014110620-faf89fe5e93b
5+
replace k8s.io/dynamic-resource-allocation => github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241025114252-8b063a6a08bc
66

77
require (
88
github.com/Masterminds/semver v1.5.0
@@ -56,11 +56,11 @@ require (
5656
github.com/josharian/intern v1.0.0 // indirect
5757
github.com/json-iterator/go v1.1.12 // indirect
5858
github.com/mailru/easyjson v0.7.7 // indirect
59-
github.com/moby/sys/mountinfo v0.7.1 // indirect
59+
github.com/moby/sys/mountinfo v0.7.2 // indirect
6060
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
6161
github.com/modern-go/reflect2 v1.0.2 // indirect
6262
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
63-
github.com/opencontainers/runc v1.1.13 // indirect
63+
github.com/opencontainers/runc v1.1.15 // indirect
6464
github.com/opencontainers/runtime-spec v1.2.0 // indirect
6565
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
6666
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
@@ -73,8 +73,8 @@ require (
7373
github.com/x448/float16 v0.8.4 // indirect
7474
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
7575
go.uber.org/multierr v1.11.0 // indirect
76-
go.uber.org/zap v1.26.0 // indirect
77-
golang.org/x/mod v0.20.0 // indirect
76+
go.uber.org/zap v1.27.0 // indirect
77+
golang.org/x/mod v0.21.0 // indirect
7878
golang.org/x/net v0.30.0 // indirect
7979
golang.org/x/oauth2 v0.23.0 // indirect
8080
golang.org/x/term v0.25.0 // indirect

go.sum

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,13 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
8585
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
8686
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
8787
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
88-
github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241014110620-faf89fe5e93b h1:S43DzJ7/nhmV+gH1DacCljYoJu6WztK1Fjpz9iUmoPQ=
89-
github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241014110620-faf89fe5e93b/go.mod h1:8g5MWHFI0UiuvtrclYyS4K8EA55VrU1/4lBG6HIQ2cI=
88+
github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241025114252-8b063a6a08bc h1:7Y3cSZsYslBJhAPVsseOhrU9RxJuCdzgaMYvoIDuGUA=
89+
github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241025114252-8b063a6a08bc/go.mod h1:dZVQfU+lUXza85oigVyOmZXj/xsFIon0O6/NamFg82M=
9090
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
9191
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
9292
github.com/mndrix/tap-go v0.0.0-20171203230836-629fa407e90b/go.mod h1:pzzDgJWZ34fGzaAZGFW22KVZDfyrYW+QABMrWnJBnSs=
93-
github.com/moby/sys/mountinfo v0.7.1 h1:/tTvQaSJRr2FshkhXiIpux6fQ2Zvc4j7tAhMTStAG2g=
94-
github.com/moby/sys/mountinfo v0.7.1/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI=
93+
github.com/moby/sys/mountinfo v0.7.2 h1:1shs6aH5s4o5H2zQLn796ADW1wMrIwHsyJ2v9KouLrg=
94+
github.com/moby/sys/mountinfo v0.7.2/go.mod h1:1YOa8w8Ih7uW0wALDUgT1dTTSBrZ+HiBLGws92L2RU4=
9595
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
9696
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
9797
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -104,8 +104,8 @@ github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA
104104
github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To=
105105
github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk=
106106
github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0=
107-
github.com/opencontainers/runc v1.1.13 h1:98S2srgG9vw0zWcDpFMn5TRrh8kLxa/5OFUstuUhmRs=
108-
github.com/opencontainers/runc v1.1.13/go.mod h1:R016aXacfp/gwQBYw2FDGa9m+n6atbLWrYY8hNMT/sA=
107+
github.com/opencontainers/runc v1.1.15 h1:QMmSU2q1YUg3iOJX11phnaDi2A5/zhx4BR6h+XZ1DMA=
108+
github.com/opencontainers/runc v1.1.15/go.mod h1:E4C2z+7BxR7GHXp0hAY53mek+x49X1LjPNeMTfRGvOA=
109109
github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
110110
github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
111111
github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
@@ -171,17 +171,17 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
171171
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
172172
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
173173
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
174-
go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
175-
go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
174+
go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8=
175+
go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
176176
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
177177
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
178178
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
179179
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
180180
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
181181
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
182182
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
183-
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
184-
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
183+
golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0=
184+
golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
185185
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
186186
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
187187
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
@@ -200,7 +200,6 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
200200
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
201201
golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
202202
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
203-
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
204203
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
205204
golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo=
206205
golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=

vendor/github.com/moby/sys/mountinfo/mounted_linux.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/go.uber.org/zap/.golangci.yml

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/go.uber.org/zap/.readme.tmpl

Lines changed: 9 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)