Skip to content

Commit 4b5bc74

Browse files
committed
Move logic to run / check status of IMEX daemons into go binary
Signed-off-by: Kevin Klues <[email protected]>
1 parent 8bfb9ec commit 4b5bc74

File tree

4 files changed

+315
-34
lines changed

4 files changed

+315
-34
lines changed

cmd/compute-domain-daemon/main.go

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
/*
2+
* Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"os"
23+
"os/exec"
24+
"os/signal"
25+
"syscall"
26+
27+
"github.com/google/uuid"
28+
"github.com/urfave/cli/v2"
29+
30+
nvdev "github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
31+
"github.com/NVIDIA/go-nvml/pkg/nvml"
32+
)
33+
34+
// Paths used by the compute domain daemon inside the container.
const (
	// nodesConfig is the IMEX nodes configuration file that is echoed to
	// stdout before the daemon is started.
	nodesConfig = "/etc/nvidia-imex/nodes_config.cfg"
	// imexConfig is the configuration file handed to the IMEX daemon via -c.
	imexConfig = "/etc/nvidia-imex/config.cfg"
	// imexLog is the log file that is followed once the daemon was launched.
	imexLog = "/var/log/nvidia-imex.log"

	// imexBinary is the IMEX daemon executable.
	imexBinary = "/usr/bin/nvidia-imex"
	// imexCtl is the control utility used to query the daemon status.
	imexCtl = "/usr/bin/nvidia-imex-ctl"
)
41+
42+
func main() {
43+
if err := newApp().Run(os.Args); err != nil {
44+
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
45+
os.Exit(1)
46+
}
47+
}
48+
49+
func newApp() *cli.App {
50+
// Create a wrapper that will be used to gracefully shut down all subcommands
51+
wrapper := func(ctx context.Context, f func(ctx context.Context) error) error {
52+
// Create a cancelable context from the one passed in
53+
ctx, cancel := context.WithCancel(ctx)
54+
defer cancel()
55+
56+
// Handle SIGTERM
57+
sigChan := make(chan os.Signal, 1)
58+
signal.Notify(sigChan, syscall.SIGTERM)
59+
go func() {
60+
<-sigChan
61+
cancel()
62+
}()
63+
64+
// Call the wrapped function
65+
return f(ctx)
66+
}
67+
68+
// Create the app
69+
app := &cli.App{
70+
Name: "compute-domain-daemon",
71+
Usage: "compute-domain-daemon manages the IMEX daemon for NVIDIA compute domains.",
72+
Commands: []*cli.Command{
73+
{
74+
Name: "run",
75+
Usage: "Run the compute domain daemon",
76+
Action: func(c *cli.Context) error {
77+
return wrapper(c.Context, run)
78+
},
79+
},
80+
{
81+
Name: "check",
82+
Usage: "Check if the node is IMEX capable and if the IMEX daemon is ready",
83+
Action: func(c *cli.Context) error {
84+
return wrapper(c.Context, check)
85+
},
86+
},
87+
},
88+
}
89+
90+
return app
91+
}
92+
93+
// run runs the compute domain daemon, checking IMEX capability and managing the IMEX daemon lifecycle.
94+
// It returns an error if any step fails.
95+
func run(ctx context.Context) error {
96+
// Check if node is IMEX capable
97+
capable, err := checkIMEXCapable(ctx)
98+
if err != nil {
99+
return fmt.Errorf("error checking IMEX capability: %w", err)
100+
}
101+
102+
if !capable {
103+
fmt.Println("ClusterUUID and CliqueId are NOT set for GPUs on this node.")
104+
fmt.Println("The IMEX daemon will not be started.")
105+
fmt.Println("Sleeping forever...")
106+
<-ctx.Done()
107+
return nil
108+
}
109+
110+
// Print nodes config
111+
if err := printNodesConfig(ctx); err != nil {
112+
return fmt.Errorf("error printing nodes config: %w", err)
113+
}
114+
115+
// Run IMEX daemon
116+
if err := runIMEXDaemon(ctx, imexConfig); err != nil {
117+
return fmt.Errorf("error running IMEX daemon: %w", err)
118+
}
119+
120+
// Tail the log file
121+
if err := tail(ctx, imexLog); err != nil {
122+
return fmt.Errorf("error tailing log file: %w", err)
123+
}
124+
125+
return nil
126+
}
127+
128+
// check verifies if the node is IMEX capable and if so, checks if the IMEX daemon is ready.
129+
// It returns an error if any step fails.
130+
func check(ctx context.Context) error {
131+
// Check if node is IMEX capable
132+
capable, err := checkIMEXCapable(ctx)
133+
if err != nil {
134+
return fmt.Errorf("error checking IMEX capability: %w", err)
135+
}
136+
137+
if !capable {
138+
fmt.Println("ClusterUUID and CliqueId are NOT set for GPUs on this node.")
139+
return nil
140+
}
141+
142+
// Check if IMEX daemon is ready
143+
cmd := exec.CommandContext(ctx, imexCtl, "-q", "-i", "127.0.0.1", "50005")
144+
output, err := cmd.Output()
145+
if err != nil {
146+
return fmt.Errorf("error checking IMEX daemon status: %w", err)
147+
}
148+
149+
if string(output) != "READY\n" {
150+
return fmt.Errorf("IMEX daemon not ready: %s", string(output))
151+
}
152+
153+
return nil
154+
}
155+
156+
// checkIMEXCapable checks if the node is capable of running IMEX by verifying
157+
// that all GPUs on the node have consistent ClusterUUID and CliqueID values.
158+
// It returns true if the node is IMEX capable, false otherwise.
159+
func checkIMEXCapable(ctx context.Context) (bool, error) {
160+
lib, err := newDeviceLib()
161+
if err != nil {
162+
return false, fmt.Errorf("error creating device library: %w", err)
163+
}
164+
165+
if err := lib.init(); err != nil {
166+
return false, fmt.Errorf("error initializing device library: %w", err)
167+
}
168+
defer lib.alwaysShutdown()
169+
170+
uniqueClusterUUIDs := make(map[string]struct{})
171+
uniqueCliqueIDs := make(map[string]struct{})
172+
173+
err = lib.VisitDevices(func(i int, d nvdev.Device) error {
174+
isFabricAttached, err := d.IsFabricAttached()
175+
if err != nil {
176+
return fmt.Errorf("error checking if device is fabric attached: %w", err)
177+
}
178+
if !isFabricAttached {
179+
return nil
180+
}
181+
182+
info, ret := d.GetGpuFabricInfo()
183+
if ret != nvml.SUCCESS {
184+
return fmt.Errorf("failed to get GPU fabric info: %w", ret)
185+
}
186+
187+
clusterUUID, err := uuid.FromBytes(info.ClusterUuid[:])
188+
if err != nil {
189+
return fmt.Errorf("invalid cluster UUID: %w", err)
190+
}
191+
192+
cliqueID := fmt.Sprintf("%d", info.CliqueId)
193+
194+
uniqueClusterUUIDs[clusterUUID.String()] = struct{}{}
195+
uniqueCliqueIDs[cliqueID] = struct{}{}
196+
197+
return nil
198+
})
199+
if err != nil {
200+
return false, fmt.Errorf("error getting fabric information from one or more devices: %w", err)
201+
}
202+
203+
if len(uniqueClusterUUIDs) == 0 && len(uniqueCliqueIDs) == 0 {
204+
return false, nil
205+
}
206+
207+
if len(uniqueClusterUUIDs) != 1 {
208+
return false, fmt.Errorf("unexpected number of unique ClusterUUIDs found on devices")
209+
}
210+
211+
if len(uniqueCliqueIDs) != 1 {
212+
return false, fmt.Errorf("unexpected number of unique CliqueIDs found on devices")
213+
}
214+
215+
return true, nil
216+
}
217+
218+
// printNodesConfig reads and prints the contents of the nodes configuration file.
219+
// It returns an error if the file cannot be read.
220+
func printNodesConfig(ctx context.Context) error {
221+
fmt.Printf("%s:\n", nodesConfig)
222+
content, err := os.ReadFile(nodesConfig)
223+
if err != nil {
224+
return fmt.Errorf("failed to read nodes config: %w", err)
225+
}
226+
fmt.Println(string(content))
227+
return nil
228+
}
229+
230+
// runIMEXDaemon starts the IMEX daemon with the specified configuration file.
231+
// It returns an error if the daemon fails to start or exits unexpectedly.
232+
func runIMEXDaemon(ctx context.Context, config string) error {
233+
cmd := exec.CommandContext(ctx, imexBinary, "-c", config)
234+
cmd.Stdout = os.Stdout
235+
cmd.Stderr = os.Stderr
236+
return cmd.Run()
237+
}
238+
239+
// tail streams the file at path to stdout using the system's tail command.
// It starts from the beginning of the file (-n +1) and keeps following new
// lines (-f), so it blocks until ctx is cancelled or the command fails.
//
// Cancellation of ctx is the expected way to stop following the file and is
// therefore not reported as a failure: tail returns nil in that case, even
// though the underlying command was killed. Any error that occurs while ctx
// is still active is returned unchanged.
func tail(ctx context.Context, path string) error {
	cmd := exec.CommandContext(ctx, "tail", "-n", "+1", "-f", path)
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	err := cmd.Run()
	if err != nil && ctx.Err() != nil {
		// The command was stopped because shutdown was requested.
		return nil
	}
	return err
}

cmd/compute-domain-daemon/nvml.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"fmt"
21+
"os"
22+
23+
nvdev "github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
24+
"github.com/NVIDIA/go-nvml/pkg/nvml"
25+
)
26+
27+
// deviceLib wraps the NVIDIA device library interfaces to provide a unified interface
28+
// for interacting with NVIDIA devices and NVML.
29+
type deviceLib struct {
30+
nvdev.Interface
31+
nvmllib nvml.Interface
32+
}
33+
34+
// newDeviceLib creates and initializes a new deviceLib instance.
35+
// It returns an error if the NVML library cannot be initialized.
36+
func newDeviceLib() (*deviceLib, error) {
37+
nvmllib := nvml.New()
38+
d := deviceLib{
39+
Interface: nvdev.New(nvmllib),
40+
nvmllib: nvmllib,
41+
}
42+
return &d, nil
43+
}
44+
45+
// init initializes the NVML library.
46+
// It returns an error if initialization fails.
47+
func (l deviceLib) init() error {
48+
ret := l.nvmllib.Init()
49+
if ret != nvml.SUCCESS {
50+
return fmt.Errorf("error initializing NVML: %v", ret)
51+
}
52+
return nil
53+
}
54+
55+
// alwaysShutdown attempts to shut down the NVML library.
56+
// It logs any errors that occur during shutdown but does not return them,
57+
// as this is typically called in a defer statement.
58+
func (l deviceLib) alwaysShutdown() {
59+
ret := l.nvmllib.Shutdown()
60+
if ret != nvml.SUCCESS {
61+
fmt.Fprintf(os.Stderr, "error shutting down NVML: %v\n", ret)
62+
}
63+
}

deployments/container/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,5 +84,6 @@ RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-L
8484

8585
COPY --from=build /artifacts/compute-domain-controller /usr/bin/compute-domain-controller
8686
COPY --from=build /artifacts/compute-domain-kubelet-plugin /usr/bin/compute-domain-kubelet-plugin
87+
COPY --from=build /artifacts/compute-domain-daemon /usr/bin/compute-domain-daemon
8788
COPY --from=build /artifacts/gpu-kubelet-plugin /usr/bin/gpu-kubelet-plugin
8889
COPY --from=build /build/templates /templates

templates/compute-domain-daemon.tmpl.yaml

Lines changed: 4 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -21,51 +21,21 @@ spec:
2121
nodeSelector:
2222
{{ .ComputeDomainLabelKey }}: {{ .ComputeDomainLabelValue }}
2323
containers:
24+
# Run the compute domain daemon
2425
- name: compute-domain-daemon
2526
image: {{ .ImageName }}
26-
command: [sh, -c]
27-
args:
28-
- |-
29-
trap 'exit 0' TERM
30-
set -e
31-
if nvidia-smi -q | grep -E "ClusterUUID|CliqueId" | grep -q "N/A" || \
32-
nvidia-smi -q | grep -E "ClusterUUID" | grep -q "00000000-0000-0000-0000-000000000000"; then
33-
echo "ClusterUUID and CliqueId are NOT set for GPUs on this node."
34-
echo "The IMEX daemon will not be started."
35-
echo "Sleeping forever..."
36-
touch /etc/nvidia-imex-null
37-
tail -f /dev/null & wait
38-
fi
39-
# Emit nodes config for facilitating debug.
40-
echo "/etc/nvidia-imex/nodes_config.cfg:"
41-
cat /etc/nvidia-imex/nodes_config.cfg
42-
/usr/bin/nvidia-imex -c /etc/nvidia-imex/config.cfg
43-
tail -n +1 -f /var/log/nvidia-imex.log & wait
27+
command: ["compute-domain-daemon", "run"]
4428
resources:
4529
claims:
4630
- name: compute-domain-daemon
4731
startupProbe:
4832
exec:
49-
command:
50-
- "sh"
51-
- "-c"
52-
- |-
53-
if [ -f /etc/nvidia-imex-null ]; then
54-
exit 0
55-
fi
56-
test "$(nvidia-imex-ctl -q -i 127.0.0.1 50005)" = "READY"
33+
command: ["compute-domain-daemon", "check"]
5734
initialDelaySeconds: 1
5835
periodSeconds: 1
5936
livenessProbe:
6037
exec:
61-
command:
62-
- "sh"
63-
- "-c"
64-
- |
65-
if [ -f /etc/nvidia-imex-null ]; then
66-
exit 0
67-
fi
68-
test "$(nvidia-imex-ctl -q -i 127.0.0.1 50005)" = "READY"
38+
command: ["compute-domain-daemon", "check"]
6939
initialDelaySeconds: 10
7040
periodSeconds: 5
7141
# Repel all node taints.

0 commit comments

Comments
 (0)