Skip to content

Commit b8b2a06

Browse files
committed
Add support for using hostnames instead of raw IPs for imex daemons
Signed-off-by: Kevin Klues <[email protected]>
1 parent 4f10463 commit b8b2a06

File tree

3 files changed

+246
-63
lines changed

3 files changed

+246
-63
lines changed
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
/*
2+
* Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"fmt"
21+
"os"
22+
"path/filepath"
23+
"strings"
24+
"sync"
25+
26+
"k8s.io/klog/v2"
27+
28+
nvapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
29+
)
30+
31+
const (
32+
maxHostnames = 18
33+
hostsFilePath = "/etc/hosts"
34+
hostnameFormat = "compute-domain-daemon-%d"
35+
)
36+
37+
// HostnameManager manages the allocation of static hostnames to IP addresses.
38+
type HostnameManager struct {
39+
sync.Mutex
40+
ipToHostname map[string]string
41+
cliqueID string
42+
nodesConfigPath string
43+
}
44+
45+
// NewHostnameManager creates a new hostname manager.
46+
func NewHostnameManager(cliqueID string, nodesConfigPath string) *HostnameManager {
47+
return &HostnameManager{
48+
ipToHostname: make(map[string]string),
49+
cliqueID: cliqueID,
50+
nodesConfigPath: nodesConfigPath,
51+
}
52+
}
53+
54+
// UpdateHostnameMappings updates the /etc/hosts file with IP to hostname mappings.
55+
func (m *HostnameManager) UpdateHostnameMappings(nodes []*nvapi.ComputeDomainNode) error {
56+
m.Lock()
57+
defer m.Unlock()
58+
59+
// Prefilter nodes to only consider those with the matching cliqueID
60+
var cliqueNodes []*nvapi.ComputeDomainNode
61+
for _, node := range nodes {
62+
if node.CliqueID == m.cliqueID {
63+
cliqueNodes = append(cliqueNodes, node)
64+
}
65+
}
66+
67+
// Find and remove stale IPs from map
68+
currentIPs := make(map[string]bool)
69+
for _, node := range cliqueNodes {
70+
currentIPs[node.IPAddress] = true
71+
}
72+
73+
for ip := range m.ipToHostname {
74+
if !currentIPs[ip] {
75+
delete(m.ipToHostname, ip)
76+
}
77+
}
78+
79+
// Add new IPs to map (filling in holes where others were removed)
80+
for _, node := range cliqueNodes {
81+
// If IP already has a hostname, skip it
82+
if _, exists := m.ipToHostname[node.IPAddress]; exists {
83+
continue
84+
}
85+
86+
hostname, err := m.allocateHostname(node.IPAddress)
87+
if err != nil {
88+
return fmt.Errorf("failed to allocate hostname for IP %s: %w", node.IPAddress, err)
89+
}
90+
m.ipToHostname[node.IPAddress] = hostname
91+
}
92+
93+
// Update the hosts file with current mappings
94+
return m.updateHostsFile()
95+
}
96+
97+
// LogHostnameMappings logs the current compute-domain-daemon mappings from memory.
98+
func (m *HostnameManager) LogHostnameMappings() {
99+
m.Lock()
100+
defer m.Unlock()
101+
102+
if len(m.ipToHostname) == 0 {
103+
klog.Infof("No compute-domain-daemon mappings found")
104+
return
105+
}
106+
107+
klog.Infof("Current compute-domain-daemon mappings:")
108+
for ip, hostname := range m.ipToHostname {
109+
klog.Infof(" %s -> %s", ip, hostname)
110+
}
111+
}
112+
113+
// allocateHostname allocates a hostname for an IP address, reusing existing hostnames if possible.
114+
func (m *HostnameManager) allocateHostname(ip string) (string, error) {
115+
// If IP already has a hostname, return it
116+
if hostname, exists := m.ipToHostname[ip]; exists {
117+
return hostname, nil
118+
}
119+
120+
// Find the next available hostname
121+
for i := 0; i < maxHostnames; i++ {
122+
hostname := fmt.Sprintf(hostnameFormat, i)
123+
// Check if this hostname is already in use
124+
inUse := false
125+
for _, existingHostname := range m.ipToHostname {
126+
if existingHostname == hostname {
127+
inUse = true
128+
break
129+
}
130+
}
131+
if !inUse {
132+
m.ipToHostname[ip] = hostname
133+
return hostname, nil
134+
}
135+
}
136+
137+
// If all hostnames are used, return an error
138+
return "", fmt.Errorf("no hostnames available (max: %d)", maxHostnames)
139+
}
140+
141+
// updateHostsFile updates the /etc/hosts file with current IP to hostname mappings.
142+
func (m *HostnameManager) updateHostsFile() error {
143+
// Read hosts file
144+
hostsContent, err := os.ReadFile(hostsFilePath)
145+
if err != nil {
146+
return fmt.Errorf("failed to read %s: %w", hostsFilePath, err)
147+
}
148+
149+
// Grab any lines to preserve, skipping existing hostname mappings
150+
var preservedLines []string
151+
for _, line := range strings.Split(string(hostsContent), "\n") {
152+
line = strings.TrimSpace(line)
153+
154+
// Keep empty lines and comments
155+
if line == "" || strings.HasPrefix(line, "#") {
156+
preservedLines = append(preservedLines, line)
157+
continue
158+
}
159+
160+
// Skip existing compute-domain-daemon mappings
161+
if strings.Contains(line, "compute-domain-daemon-") {
162+
continue
163+
}
164+
165+
// Keep all other lines
166+
preservedLines = append(preservedLines, line)
167+
}
168+
169+
// Add preserved lines
170+
var newHostsContent strings.Builder
171+
for _, line := range preservedLines {
172+
newHostsContent.WriteString(line)
173+
newHostsContent.WriteString("\n")
174+
}
175+
176+
// Add a separator comment
177+
newHostsContent.WriteString("# Compute Domain Daemon mappings\n")
178+
179+
// Add new hostname mappings
180+
for ip, hostname := range m.ipToHostname {
181+
newHostsContent.WriteString(fmt.Sprintf("%s\t%s\n", ip, hostname))
182+
}
183+
184+
// Write the updated hosts file
185+
if err := os.WriteFile(hostsFilePath, []byte(newHostsContent.String()), 0644); err != nil {
186+
return fmt.Errorf("failed to write %s: %w", hostsFilePath, err)
187+
}
188+
189+
return nil
190+
}
191+
192+
// WriteNodesConfig creates a static nodes config file with hostnames.
193+
func (m *HostnameManager) WriteNodesConfig() error {
194+
// Ensure the directory exists
195+
dir := filepath.Dir(m.nodesConfigPath)
196+
if err := os.MkdirAll(dir, 0755); err != nil {
197+
return fmt.Errorf("failed to create directory %s: %w", dir, err)
198+
}
199+
200+
// Create or overwrite the nodesConfig file
201+
f, err := os.Create(m.nodesConfigPath)
202+
if err != nil {
203+
return fmt.Errorf("failed to create nodes config file: %w", err)
204+
}
205+
defer f.Close()
206+
207+
// Write static hostnames
208+
for i := 0; i < maxHostnames; i++ {
209+
hostname := fmt.Sprintf(hostnameFormat, i)
210+
if _, err := fmt.Fprintf(f, "%s\n", hostname); err != nil {
211+
return fmt.Errorf("failed to write to nodes config file: %w", err)
212+
}
213+
}
214+
215+
klog.Infof("Created static nodes config file with %d hostnames using format %s", maxHostnames, hostnameFormat)
216+
return nil
217+
}

cmd/compute-domain-daemon/main.go

Lines changed: 18 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import (
2222
"os"
2323
"os/exec"
2424
"os/signal"
25-
"path/filepath"
2625
"strings"
2726
"sync"
2827
"syscall"
@@ -32,7 +31,6 @@ import (
3231
"github.com/Masterminds/semver"
3332
"github.com/urfave/cli/v2"
3433

35-
nvapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
3634
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
3735
)
3836

@@ -155,7 +153,6 @@ func newApp() *cli.App {
155153

156154
// Run invokes the IMEX daemon and manages its lifecycle.
157155
func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
158-
159156
// Support heterogeneous compute domain
160157
if flags.cliqueID == "" {
161158
fmt.Println("ClusterUUID and CliqueId are NOT set for GPUs on this node.")
@@ -175,7 +172,15 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
175172
}
176173
klog.Infof("config: %v", config)
177174

178-
// Prepare IMEX daemon process manager (not invoking the process yet).
175+
// Prepare Hostname manager
176+
hostnameManager := NewHostnameManager(flags.cliqueID, nodesConfigPath)
177+
178+
// Create static nodes config file with hostnames
179+
if err := hostnameManager.WriteNodesConfig(); err != nil {
180+
return fmt.Errorf("failed to create static nodes config: %w", err)
181+
}
182+
183+
// Prepare IMEX daemon process manager.
179184
daemonCommandLine := []string{imexBinaryPath, "-c", imexConfigPath}
180185
processManager := NewProcessManager(daemonCommandLine)
181186

@@ -198,11 +203,11 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
198203
}()
199204

200205
// Start IMEXDaemonUpdateLoop() in goroutine (watches for CD status
201-
// changes, and restarts the IMEX daemon as needed).
206+
// changes, and updates /etc/hosts with IP to hostname mappings).
202207
wg.Add(1)
203208
go func() {
204209
defer wg.Done()
205-
if err := IMEXDaemonUpdateLoop(ctx, controller, flags.cliqueID, processManager); err != nil {
210+
if err := IMEXDaemonUpdateLoop(ctx, controller, processManager, hostnameManager); err != nil {
206211
klog.Errorf("IMEXDaemonUpdateLoop failed, initiate shutdown: %s", err)
207212
cancel()
208213
}
@@ -227,25 +232,22 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
227232
}
228233

229234
// IMEXDaemonUpdateLoop() reacts to ComputeDomain status changes by updating the
230-
// IMEX daemon nodes config file and (re)starting the IMEX daemon process.
231-
func IMEXDaemonUpdateLoop(ctx context.Context, controller *Controller, cliqueID string, pm *ProcessManager) error {
235+
// /etc/hosts file with IP to hostname mappings and restarting the IMEX daemon.
236+
func IMEXDaemonUpdateLoop(ctx context.Context, controller *Controller, processManager *ProcessManager, hostnameManager *HostnameManager) error {
232237
for {
233238
klog.Infof("wait for nodes update")
234239
select {
235240
case <-ctx.Done():
236241
klog.Infof("shutdown: stop IMEXDaemonUpdateLoop")
237242
return nil
238243
case nodes := <-controller.GetNodesUpdateChan():
239-
if err := writeNodesConfig(cliqueID, nodes); err != nil {
240-
return fmt.Errorf("writeNodesConfig failed: %w", err)
244+
if err := hostnameManager.UpdateHostnameMappings(nodes); err != nil {
245+
return fmt.Errorf("failed to update hostname => IP mappings: %w", err)
241246
}
242-
243-
klog.Infof("Got update, (re)start IMEX daemon")
244-
if err := pm.Restart(); err != nil {
245-
// This might be a permanent problem, and retrying upon next update
246-
// might be pointless. Terminate us.
247-
return fmt.Errorf("error (re)starting IMEX daemon: %w", err)
247+
if err := processManager.EnsureStarted(); err != nil {
248+
return fmt.Errorf("failed to ensure IMEX daemon is started: %w", err)
248249
}
250+
hostnameManager.LogHostnameMappings()
249251
}
250252
}
251253
}
@@ -288,50 +290,6 @@ func check(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
288290
return nil
289291
}
290292

291-
// cwriteNodesConfig creates a nodesConfig file with IPs for nodes in the same clique.
292-
func writeNodesConfig(cliqueID string, nodes []*nvapi.ComputeDomainNode) error {
293-
// Ensure the directory exists
294-
dir := filepath.Dir(nodesConfigPath)
295-
if err := os.MkdirAll(dir, 0755); err != nil {
296-
return fmt.Errorf("failed to create directory %s: %w", dir, err)
297-
}
298-
299-
// Create or overwrite the nodesConfig file
300-
f, err := os.Create(nodesConfigPath)
301-
if err != nil {
302-
return fmt.Errorf("failed to create nodes config file: %w", err)
303-
}
304-
defer f.Close()
305-
306-
// Write IPs for nodes in the same clique
307-
//
308-
// Note(JP): wo we need to apply this type of filtering also in the logic
309-
// that checks if an IMEX daemon restart is required?
310-
for _, node := range nodes {
311-
if node.CliqueID == cliqueID {
312-
if _, err := fmt.Fprintf(f, "%s\n", node.IPAddress); err != nil {
313-
return fmt.Errorf("failed to write to nodes config file: %w", err)
314-
}
315-
}
316-
}
317-
318-
if err := logNodesConfig(); err != nil {
319-
return fmt.Errorf("logNodesConfig failed: %w", err)
320-
}
321-
return nil
322-
}
323-
324-
// Read and log the contents of the nodes configuration file. Return an error if
325-
// the file cannot be read.
326-
func logNodesConfig() error {
327-
content, err := os.ReadFile(nodesConfigPath)
328-
if err != nil {
329-
return fmt.Errorf("failed to read nodes config: %w", err)
330-
}
331-
klog.Infof("Current %s:\n%s", nodesConfigPath, string(content))
332-
return nil
333-
}
334-
335293
// getIMEXVersion returns the version of the NVIDIA IMEX binary.
336294
func getIMEXVersion(ctx context.Context) (*semver.Version, error) {
337295
cmd := exec.CommandContext(ctx, imexBinaryPath, "--version")

cmd/compute-domain-daemon/process.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ func NewProcessManager(cmd []string) *ProcessManager {
4646
return m
4747
}
4848

49-
// Restart() starts or restarts the process.
50-
func (m *ProcessManager) Restart() error {
49+
// restart starts or restarts the process. This is an internal method.
50+
func (m *ProcessManager) restart() error {
5151
if m.handle != nil {
5252
if err := m.stop(); err != nil {
5353
return fmt.Errorf("restart: stop failed: %w", err)
@@ -56,6 +56,14 @@ func (m *ProcessManager) Restart() error {
5656
return m.start()
5757
}
5858

59+
// EnsureStarted starts the process if it is not already running. If the process is already started, this is a no-op.
60+
func (m *ProcessManager) EnsureStarted() error {
61+
if m.handle != nil {
62+
return nil
63+
}
64+
return m.start()
65+
}
66+
5967
func (m *ProcessManager) start() error {
6068
m.Lock()
6169
defer m.Unlock()
@@ -170,7 +178,7 @@ func (m *ProcessManager) Watchdog(ctx context.Context) error {
170178
}
171179

172180
klog.Warningf("Watchdog: start process again")
173-
if err := m.Restart(); err != nil {
181+
if err := m.restart(); err != nil {
174182
return fmt.Errorf("watchdog: process lost, restart failed, treat fatal: %w", err)
175183
}
176184

0 commit comments

Comments
 (0)