Skip to content

Commit 29a4cd8

Browse files
committed
Make maxNodesPerIMEXDomain configurable (default at 18)
Signed-off-by: Kevin Klues <[email protected]>
1 parent 39f94dd commit 29a4cd8

File tree

6 files changed

+52
-22
lines changed

6 files changed

+52
-22
lines changed

cmd/compute-domain-controller/controller.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ type ManagerConfig struct {
3838
// imageName is the full image name to use when rendering templates
3939
imageName string
4040

41+
// maxNodesPerIMEXDomain is the maximum number of nodes to allocate per IMEX domain
42+
maxNodesPerIMEXDomain int
43+
4144
// clientsets provides access to various Kubernetes API client interfaces
4245
clientsets flags.ClientSets
4346

@@ -67,12 +70,13 @@ func (c *Controller) Run(ctx context.Context) error {
6770
workQueue := workqueue.New(workqueue.DefaultControllerRateLimiter())
6871

6972
managerConfig := &ManagerConfig{
70-
driverName: c.config.driverName,
71-
driverNamespace: c.config.flags.namespace,
72-
additionalNamespaces: c.config.flags.additionalNamespaces.Value(),
73-
imageName: c.config.flags.imageName,
74-
clientsets: c.config.clientsets,
75-
workQueue: workQueue,
73+
driverName: c.config.driverName,
74+
driverNamespace: c.config.flags.namespace,
75+
additionalNamespaces: c.config.flags.additionalNamespaces.Value(),
76+
imageName: c.config.flags.imageName,
77+
maxNodesPerIMEXDomain: c.config.flags.maxNodesPerIMEXDomain,
78+
clientsets: c.config.clientsets,
79+
workQueue: workQueue,
7680
}
7781

7882
cdManager := NewComputeDomainManager(managerConfig)

cmd/compute-domain-controller/daemonset.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ type DaemonSetTemplateData struct {
5050
ComputeDomainLabelValue types.UID
5151
ResourceClaimTemplateName string
5252
ImageName string
53+
MaxNodesPerIMEXDomain int
5354
FeatureGates map[string]bool
5455
}
5556

@@ -200,6 +201,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, cd *nvapi.ComputeDomain)
200201
ComputeDomainLabelValue: cd.UID,
201202
ResourceClaimTemplateName: rct.Name,
202203
ImageName: m.config.imageName,
204+
MaxNodesPerIMEXDomain: m.config.maxNodesPerIMEXDomain,
203205
FeatureGates: featuregates.ToMap(),
204206
}
205207

cmd/compute-domain-controller/main.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,23 @@ import (
4444

4545
const (
4646
DriverName = "compute-domain.nvidia.com"
47+
48+
// This constant provides a reasonable default for the maximum size of
49+
// a given IMEX Domain. On GB200 and GB300 the limit is 18, so we pick
50+
this for now. It can be overridden via an environment variable or
51+
// command line argument as required.
52+
defaultMaxNodesPerIMEXDomain = 18
4753
)
4854

4955
type Flags struct {
5056
kubeClientConfig flags.KubeClientConfig
5157
loggingConfig *flags.LoggingConfig
5258
featureGateConfig *flags.FeatureGateConfig
5359

54-
podName string
55-
namespace string
56-
imageName string
60+
podName string
61+
namespace string
62+
imageName string
63+
maxNodesPerIMEXDomain int
5764

5865
httpEndpoint string
5966
metricsPath string
@@ -103,6 +110,13 @@ func newApp() *cli.App {
103110
Destination: &flags.imageName,
104111
EnvVars: []string{"IMAGE_NAME"},
105112
},
113+
&cli.IntFlag{
114+
Name: "max-nodes-per-imex-domain",
115+
Usage: "The maximum number of possible nodes per IMEX domain",
116+
Value: defaultMaxNodesPerIMEXDomain,
117+
EnvVars: []string{"MAX_NODES_PER_IMEX_DOMAIN"},
118+
Destination: &flags.maxNodesPerIMEXDomain,
119+
},
106120
&cli.StringFlag{
107121
Category: "HTTP server:",
108122
Name: "http-endpoint",

cmd/compute-domain-daemon/dnsnames.go

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ import (
3030
)
3131

3232
const (
33-
maxDNSNames = 18
3433
hostsFilePath = "/etc/hosts"
3534
dnsNameFormat = "compute-domain-daemon-%d"
3635
)
@@ -41,17 +40,19 @@ type IPToDNSNameMap map[string]string
4140
// DNSNameManager manages the allocation of static DNS names to IP addresses.
4241
type DNSNameManager struct {
4342
sync.Mutex
44-
ipToDNSName IPToDNSNameMap
45-
cliqueID string
46-
nodesConfigPath string
43+
ipToDNSName IPToDNSNameMap
44+
cliqueID string
45+
maxNodesPerIMEXDomain int
46+
nodesConfigPath string
4747
}
4848

4949
// NewDNSNameManager creates a new DNS name manager.
50-
func NewDNSNameManager(cliqueID string, nodesConfigPath string) *DNSNameManager {
50+
func NewDNSNameManager(cliqueID string, maxNodesPerIMEXDomain int, nodesConfigPath string) *DNSNameManager {
5151
return &DNSNameManager{
52-
ipToDNSName: make(IPToDNSNameMap),
53-
cliqueID: cliqueID,
54-
nodesConfigPath: nodesConfigPath,
52+
ipToDNSName: make(IPToDNSNameMap),
53+
cliqueID: cliqueID,
54+
maxNodesPerIMEXDomain: maxNodesPerIMEXDomain,
55+
nodesConfigPath: nodesConfigPath,
5556
}
5657
}
5758

@@ -134,7 +135,7 @@ func (m *DNSNameManager) allocateDNSName(ip string) (string, error) {
134135
}
135136

136137
// Find the next available DNS name
137-
for i := 0; i < maxDNSNames; i++ {
138+
for i := 0; i < m.maxNodesPerIMEXDomain; i++ {
138139
dnsName := fmt.Sprintf(dnsNameFormat, i)
139140
// Check if this DNS name is already in use
140141
inUse := false
@@ -151,7 +152,7 @@ func (m *DNSNameManager) allocateDNSName(ip string) (string, error) {
151152
}
152153

153154
// If all DNS names are used, return an error
154-
return "", fmt.Errorf("no DNS names available (max: %d)", maxDNSNames)
155+
return "", fmt.Errorf("no DNS names available (max: %d)", m.maxNodesPerIMEXDomain)
155156
}
156157

157158
// updateHostsFile updates the /etc/hosts file with current IP to DNS name mappings.
@@ -221,14 +222,14 @@ func (m *DNSNameManager) WriteNodesConfig() error {
221222
defer f.Close()
222223

223224
// Write static DNS names
224-
for i := 0; i < maxDNSNames; i++ {
225+
for i := 0; i < m.maxNodesPerIMEXDomain; i++ {
225226
dnsName := fmt.Sprintf(dnsNameFormat, i)
226227
if _, err := fmt.Fprintf(f, "%s\n", dnsName); err != nil {
227228
return fmt.Errorf("failed to write to nodes config file: %w", err)
228229
}
229230
}
230231

231-
klog.Infof("Created static nodes config file with %d DNS names using format %s", maxDNSNames, dnsNameFormat)
232+
klog.Infof("Created static nodes config file with %d DNS names using format %s", m.maxNodesPerIMEXDomain, dnsNameFormat)
232233

233234
return nil
234235
}

cmd/compute-domain-daemon/main.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ type Flags struct {
5151
computeDomainNamespace string
5252
nodeName string
5353
podIP string
54+
maxNodesPerIMEXDomain int
5455
loggingConfig *flags.LoggingConfig
5556
featureGateConfig *flags.FeatureGateConfig
5657
}
@@ -124,6 +125,12 @@ func newApp() *cli.App {
124125
EnvVars: []string{"POD_IP"},
125126
Destination: &flags.podIP,
126127
},
128+
&cli.IntFlag{
129+
Name: "max-nodes-per-imex-domain",
130+
Usage: "The maximum number of possible nodes per IMEX domain",
131+
EnvVars: []string{"MAX_NODES_PER_IMEX_DOMAIN"},
132+
Destination: &flags.maxNodesPerIMEXDomain,
133+
},
127134
}
128135
cliFlags = append(cliFlags, flags.featureGateConfig.Flags()...)
129136
cliFlags = append(cliFlags, flags.loggingConfig.Flags()...)
@@ -181,7 +188,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
181188
var dnsNameManager *DNSNameManager
182189
if featuregates.Enabled(featuregates.IMEXDaemonsWithDNSNames) {
183190
// Prepare DNS name manager
184-
dnsNameManager = NewDNSNameManager(flags.cliqueID, nodesConfigPath)
191+
dnsNameManager = NewDNSNameManager(flags.cliqueID, flags.maxNodesPerIMEXDomain, nodesConfigPath)
185192

186193
// Create static nodes config file with DNS names
187194
if err := dnsNameManager.WriteNodesConfig(); err != nil {

templates/compute-domain-daemon.tmpl.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ spec:
2626
image: {{ .ImageName }}
2727
command: ["compute-domain-daemon", "-v", "6", "run"]
2828
env:
29+
- name: MAX_NODES_PER_IMEX_DOMAIN
30+
value: "{{ .MaxNodesPerIMEXDomain }}"
2931
- name: NODE_NAME
3032
valueFrom:
3133
fieldRef:

0 commit comments

Comments
 (0)