Skip to content

Commit 0a2bc66

Browse files
committed
Make maxNodesPerIMEXDomain configurable (default at 18)
Signed-off-by: Kevin Klues <[email protected]>
1 parent b8b2a06 commit 0a2bc66

File tree

6 files changed: 47 additions and 21 deletions.

cmd/compute-domain-controller/controller.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ type ManagerConfig struct {
3838
// imageName is the full image name to use when rendering templates
3939
imageName string
4040

41+
// maxNodesPerIMEXDomain is the maximum number of nodes that can be allocated to a single IMEX domain
42+
maxNodesPerIMEXDomain int
43+
4144
// clientsets provides access to various Kubernetes API client interfaces
4245
clientsets flags.ClientSets
4346

@@ -63,11 +66,12 @@ func (c *Controller) Run(ctx context.Context) error {
6366
workQueue := workqueue.New(workqueue.DefaultControllerRateLimiter())
6467

6568
managerConfig := &ManagerConfig{
66-
driverName: c.config.driverName,
67-
driverNamespace: c.config.flags.namespace,
68-
imageName: c.config.flags.imageName,
69-
clientsets: c.config.clientsets,
70-
workQueue: workQueue,
69+
driverName: c.config.driverName,
70+
driverNamespace: c.config.flags.namespace,
71+
imageName: c.config.flags.imageName,
72+
maxNodesPerIMEXDomain: c.config.flags.maxNodesPerIMEXDomain,
73+
clientsets: c.config.clientsets,
74+
workQueue: workQueue,
7175
}
7276

7377
cdManager := NewComputeDomainManager(managerConfig)

cmd/compute-domain-controller/daemonset.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ type DaemonSetTemplateData struct {
4949
ComputeDomainLabelValue types.UID
5050
ResourceClaimTemplateName string
5151
ImageName string
52+
MaxNodesPerIMEXDomain int
5253
}
5354

5455
type DaemonSetManager struct {
@@ -181,6 +182,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nva
181182
ComputeDomainLabelValue: cd.UID,
182183
ResourceClaimTemplateName: rct.Name,
183184
ImageName: m.config.imageName,
185+
MaxNodesPerIMEXDomain: m.config.maxNodesPerIMEXDomain,
184186
}
185187

186188
tmpl, err := template.ParseFiles(DaemonSetTemplatePath)

cmd/compute-domain-controller/main.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,16 +43,18 @@ import (
4343
)
4444

4545
const (
46-
DriverName = "compute-domain.nvidia.com"
46+
DriverName = "compute-domain.nvidia.com"
47+
defaultMaxNodesPerIMEXDomain = 18
4748
)
4849

4950
type Flags struct {
5051
kubeClientConfig flags.KubeClientConfig
5152
loggingConfig *flags.LoggingConfig
5253

53-
podName string
54-
namespace string
55-
imageName string
54+
podName string
55+
namespace string
56+
imageName string
57+
maxNodesPerIMEXDomain int
5658

5759
httpEndpoint string
5860
metricsPath string
@@ -99,6 +101,13 @@ func newApp() *cli.App {
99101
Destination: &flags.imageName,
100102
EnvVars: []string{"IMAGE_NAME"},
101103
},
104+
&cli.IntFlag{
105+
Name: "max-nodes-per-imex-domain",
106+
Usage: "The maximum number of possible nodes per IMEX domain",
107+
Value: defaultMaxNodesPerIMEXDomain,
108+
EnvVars: []string{"MAX_NODES_PER_IMEX_DOMAIN"},
109+
Destination: &flags.maxNodesPerIMEXDomain,
110+
},
102111
&cli.StringFlag{
103112
Category: "HTTP server:",
104113
Name: "http-endpoint",

cmd/compute-domain-daemon/hostname.go

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,19 @@ const (
3737
// HostnameManager manages the allocation of static hostnames to IP addresses.
3838
type HostnameManager struct {
3939
sync.Mutex
40-
ipToHostname map[string]string
41-
cliqueID string
42-
nodesConfigPath string
40+
ipToHostname map[string]string
41+
cliqueID string
42+
maxNodesPerIMEXDomain int
43+
nodesConfigPath string
4344
}
4445

4546
// NewHostnameManager creates a new hostname manager.
46-
func NewHostnameManager(cliqueID string, nodesConfigPath string) *HostnameManager {
47+
func NewHostnameManager(cliqueID string, maxNodesPerIMEXDomain int, nodesConfigPath string) *HostnameManager {
4748
return &HostnameManager{
48-
ipToHostname: make(map[string]string),
49-
cliqueID: cliqueID,
50-
nodesConfigPath: nodesConfigPath,
49+
ipToHostname: make(map[string]string),
50+
cliqueID: cliqueID,
51+
maxNodesPerIMEXDomain: maxNodesPerIMEXDomain,
52+
nodesConfigPath: nodesConfigPath,
5153
}
5254
}
5355

@@ -118,7 +120,7 @@ func (m *HostnameManager) allocateHostname(ip string) (string, error) {
118120
}
119121

120122
// Find the next available hostname
121-
for i := 0; i < maxHostnames; i++ {
123+
for i := 0; i < m.maxNodesPerIMEXDomain; i++ {
122124
hostname := fmt.Sprintf(hostnameFormat, i)
123125
// Check if this hostname is already in use
124126
inUse := false
@@ -135,7 +137,7 @@ func (m *HostnameManager) allocateHostname(ip string) (string, error) {
135137
}
136138

137139
// If all hostnames are used, return an error
138-
return "", fmt.Errorf("no hostnames available (max: %d)", maxHostnames)
140+
return "", fmt.Errorf("no hostnames available (max: %d)", m.maxNodesPerIMEXDomain)
139141
}
140142

141143
// updateHostsFile updates the /etc/hosts file with current IP to hostname mappings.
@@ -205,13 +207,13 @@ func (m *HostnameManager) WriteNodesConfig() error {
205207
defer f.Close()
206208

207209
// Write static hostnames
208-
for i := 0; i < maxHostnames; i++ {
210+
for i := 0; i < m.maxNodesPerIMEXDomain; i++ {
209211
hostname := fmt.Sprintf(hostnameFormat, i)
210212
if _, err := fmt.Fprintf(f, "%s\n", hostname); err != nil {
211213
return fmt.Errorf("failed to write to nodes config file: %w", err)
212214
}
213215
}
214216

215-
klog.Infof("Created static nodes config file with %d hostnames using format %s", maxHostnames, hostnameFormat)
217+
klog.Infof("Created static nodes config file with %d hostnames using format %s", m.maxNodesPerIMEXDomain, hostnameFormat)
216218
return nil
217219
}

cmd/compute-domain-daemon/main.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ type Flags struct {
4848
computeDomainNamespace string
4949
nodeName string
5050
podIP string
51+
maxNodesPerIMEXDomain int
5152
loggingConfig *flags.LoggingConfig
5253
}
5354

@@ -119,6 +120,12 @@ func newApp() *cli.App {
119120
EnvVars: []string{"POD_IP"},
120121
Destination: &flags.podIP,
121122
},
123+
&cli.IntFlag{
124+
Name: "max-nodes-per-imex-domain",
125+
Usage: "The maximum number of possible nodes per IMEX domain",
126+
EnvVars: []string{"MAX_NODES_PER_IMEX_DOMAIN"},
127+
Destination: &flags.maxNodesPerIMEXDomain,
128+
},
122129
}
123130
cliFlags = append(cliFlags, flags.loggingConfig.Flags()...)
124131

@@ -173,7 +180,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
173180
klog.Infof("config: %v", config)
174181

175182
// Prepare Hostname manager
176-
hostnameManager := NewHostnameManager(flags.cliqueID, nodesConfigPath)
183+
hostnameManager := NewHostnameManager(flags.cliqueID, flags.maxNodesPerIMEXDomain, nodesConfigPath)
177184

178185
// Create static nodes config file with hostnames
179186
if err := hostnameManager.WriteNodesConfig(); err != nil {

templates/compute-domain-daemon.tmpl.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ spec:
2626
image: {{ .ImageName }}
2727
command: ["compute-domain-daemon", "-v", "6", "run"]
2828
env:
29+
- name: MAX_NODES_PER_IMEX_DOMAIN
30+
value: "{{ .MaxNodesPerIMEXDomain }}"
2931
- name: NODE_NAME
3032
valueFrom:
3133
fieldRef:

0 commit comments

Comments
 (0)