@@ -185,14 +185,6 @@ func newApp() *cli.App {
185185
186186// run invokes the IMEX daemon and manages its lifecycle.
187187func run (ctx context.Context , cancel context.CancelFunc , flags * Flags ) error {
188- // Support heterogeneous compute domain
189- if flags .cliqueID == "" {
190- fmt .Println ("ClusterUUID and CliqueId are NOT set for GPUs on this node." )
191- fmt .Println ("The IMEX daemon will not be started." )
192- fmt .Println ("Sleeping forever..." )
193- <- ctx .Done ()
194- return nil
195- }
196188
197189 config := & ControllerConfig {
198190 cliqueID : flags .cliqueID ,
@@ -207,7 +199,17 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
207199 }
208200 klog .Infof ("config: %v" , config )
209201
210- // Write the IMEX config with the current pod IP before starting the daemon
202+ // Support heterogeneous ComputeDomains. That means that a CD may contain
203+ // nodes that do not take part in Multi-Node NVLink communication. On such
204+ // nodes, this program is started with an empty NVLink clique ID
205+ // configuration parameter. In this mode, do not start the IMEX daemon but
206+ // otherwise keep business logic intact. In particular, continuously update
207+ // this node's state in the CD object.
208+ if flags .cliqueID == "" {
209+ klog .Infof ("no cliqueID: register with ComputeDomain, but do not run IMEX daemon" )
210+ }
211+
212+ // Render and write the IMEX daemon config with the current pod IP
211213 if err := writeIMEXConfig (flags .podIP ); err != nil {
212214 return fmt .Errorf ("writeIMEXConfig failed: %w" , err )
213215 }
@@ -302,6 +304,11 @@ func IMEXDaemonUpdateLoopWithIPs(ctx context.Context, controller *Controller, cl
302304 return fmt .Errorf ("writeNodesConfig failed: %w" , err )
303305 }
304306
307+ if cliqueID == "" {
308+ klog .V (1 ).Infof ("empty cliqueID: do not start IMEX daemon" )
309+ break
310+ }
311+
305312 klog .Infof ("Got update, (re)start IMEX daemon" )
306313 if err := pm .Restart (); err != nil {
307314 // This might be a permanent problem, and retrying upon next update
@@ -331,6 +338,11 @@ func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controlle
331338 return fmt .Errorf ("failed to update DNS name => IP mappings: %w" , err )
332339 }
333340
341+ if dnsNameManager .cliqueID == "" {
342+ klog .V (1 ).Infof ("empty cliqueID: do not start IMEX daemon" )
343+ break
344+ }
345+
334346 fresh , err := processManager .EnsureStarted ()
335347 if err != nil {
336348 return fmt .Errorf ("failed to ensure IMEX daemon is started: %w" , err )
@@ -344,7 +356,7 @@ func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controlle
344356 // addresses compared to the old set (then we don't need to force
345357 // the daemon to re-resolve & re-connect).
346358 if ! updated || fresh {
347- continue
359+ break
348360 }
349361
350362 // Actively ask the IMEX daemon to re-read its config and to
@@ -365,7 +377,7 @@ func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controlle
365377// It returns an error if any step fails.
366378func check (ctx context.Context , cancel context.CancelFunc , flags * Flags ) error {
367379 if flags .cliqueID == "" {
368- fmt .Println ("ClusterUUID and CliqueId are NOT set for GPUs on this node. " )
380+ fmt .Println ("check succeeded (noop, clique ID is empty) " )
369381 return nil
370382 }
371383
@@ -405,6 +417,12 @@ func writeIMEXConfig(podIP string) error {
405417 return fmt .Errorf ("error executing template: %w" , err )
406418 }
407419
420+ // Ensure the directory exists
421+ dir := filepath .Dir (imexConfigPath )
422+ if err := os .MkdirAll (dir , 0755 ); err != nil {
423+ return fmt .Errorf ("failed to create directory %s: %w" , dir , err )
424+ }
425+
408426 if err := os .WriteFile (imexConfigPath , configFile .Bytes (), 0644 ); err != nil {
409427 return fmt .Errorf ("error writing config file %v: %w" , imexConfigPath , err )
410428 }
0 commit comments