diff --git a/cmd/compute-domain-daemon/main.go b/cmd/compute-domain-daemon/main.go index 4c730d186..6687b5780 100644 --- a/cmd/compute-domain-daemon/main.go +++ b/cmd/compute-domain-daemon/main.go @@ -38,11 +38,12 @@ import ( ) const ( - nodesConfigPath = "/etc/nvidia-imex/nodes_config.cfg" - imexConfigPath = "/etc/nvidia-imex/config.cfg" - imexConfigTmplPath = "/etc/nvidia-imex/config.tmpl.cfg" - imexBinaryName = "nvidia-imex" - imexCtlBinaryName = "nvidia-imex-ctl" + imexDaemonConfigDirPath = "/imexd" + imexDaemonConfigPath = imexDaemonConfigDirPath + "/imexd.cfg" + imexDaemonConfigTmplPath = imexDaemonConfigDirPath + "/imexd.cfg.tmpl" + imexDaemonNodesConfigPath = imexDaemonConfigDirPath + "/nodes.cfg" + imexDaemonBinaryName = "nvidia-imex" + imexCtlBinaryName = "nvidia-imex-ctl" ) type Flags struct { @@ -60,7 +61,8 @@ type Flags struct { } type IMEXConfigTemplateData struct { - IMEXCmdBindInterfaceIP string + IMEXCmdBindInterfaceIP string + IMEXDaemonNodesConfigPath string } func main() { @@ -218,7 +220,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { var dnsNameManager *DNSNameManager if featuregates.Enabled(featuregates.IMEXDaemonsWithDNSNames) { // Prepare DNS name manager - dnsNameManager = NewDNSNameManager(flags.cliqueID, flags.maxNodesPerIMEXDomain, nodesConfigPath) + dnsNameManager = NewDNSNameManager(flags.cliqueID, flags.maxNodesPerIMEXDomain, imexDaemonNodesConfigPath) // Create static nodes config file with DNS names if err := dnsNameManager.WriteNodesConfig(); err != nil { @@ -227,7 +229,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { } // Prepare IMEX daemon process manager. - daemonCommandLine := []string{imexBinaryName, "-c", imexConfigPath} + daemonCommandLine := []string{imexDaemonBinaryName, "-c", imexDaemonConfigPath} processManager := NewProcessManager(daemonCommandLine) // Prepare controller with CD manager (not invoking the controller yet). 
@@ -385,7 +387,7 @@ func check(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { // return". This probes if the local IMEX daemon is ready (not the entire // domain). Reference: // https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/cmdservice.html - cmd := exec.CommandContext(ctx, imexCtlBinaryName, "-q") + cmd := exec.CommandContext(ctx, imexCtlBinaryName, "-c", imexDaemonConfigPath, "-q") // Spawn child, collect standard streams. outerr, err := cmd.CombinedOutput() @@ -404,10 +406,11 @@ func check(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { // writeIMEXConfig renders the config template with the pod IP and writes it to the final config file. func writeIMEXConfig(podIP string) error { configTemplateData := IMEXConfigTemplateData{ - IMEXCmdBindInterfaceIP: podIP, + IMEXCmdBindInterfaceIP: podIP, + IMEXDaemonNodesConfigPath: imexDaemonNodesConfigPath, } - tmpl, err := template.ParseFiles(imexConfigTmplPath) + tmpl, err := template.ParseFiles(imexDaemonConfigTmplPath) if err != nil { return fmt.Errorf("error parsing template file: %w", err) } @@ -418,29 +421,29 @@ func writeIMEXConfig(podIP string) error { } // Ensure the directory exists - dir := filepath.Dir(imexConfigPath) + dir := filepath.Dir(imexDaemonConfigPath) if err := os.MkdirAll(dir, 0755); err != nil { return fmt.Errorf("failed to create directory %s: %w", dir, err) } - if err := os.WriteFile(imexConfigPath, configFile.Bytes(), 0644); err != nil { - return fmt.Errorf("error writing config file %v: %w", imexConfigPath, err) + if err := os.WriteFile(imexDaemonConfigPath, configFile.Bytes(), 0644); err != nil { + return fmt.Errorf("error writing config file %v: %w", imexDaemonConfigPath, err) } - klog.Infof("Updated IMEX config file with pod IP: %s", podIP) + klog.Infof("Rendered IMEX daemon config file with: %v", configTemplateData) return nil } // writeNodesConfig creates a nodesConfig file with IPs for nodes in the same clique. 
func writeNodesConfig(cliqueID string, nodes []*nvapi.ComputeDomainNode) error { // Ensure the directory exists - dir := filepath.Dir(nodesConfigPath) + dir := filepath.Dir(imexDaemonNodesConfigPath) if err := os.MkdirAll(dir, 0755); err != nil { return fmt.Errorf("failed to create directory %s: %w", dir, err) } // Create or overwrite the nodesConfig file - f, err := os.Create(nodesConfigPath) + f, err := os.Create(imexDaemonNodesConfigPath) if err != nil { return fmt.Errorf("failed to create nodes config file: %w", err) } @@ -467,10 +470,10 @@ func writeNodesConfig(cliqueID string, nodes []*nvapi.ComputeDomainNode) error { // Read and log the contents of the nodes configuration file. Return an error if // the file cannot be read. func logNodesConfig() error { - content, err := os.ReadFile(nodesConfigPath) + content, err := os.ReadFile(imexDaemonNodesConfigPath) if err != nil { return fmt.Errorf("failed to read nodes config: %w", err) } - klog.Infof("Current %s:\n%s", nodesConfigPath, string(content)) + klog.Infof("Current %s:\n%s", imexDaemonNodesConfigPath, string(content)) return nil } diff --git a/cmd/compute-domain-kubelet-plugin/computedomain.go b/cmd/compute-domain-kubelet-plugin/computedomain.go index 5ea94db1e..235fcde81 100644 --- a/cmd/compute-domain-kubelet-plugin/computedomain.go +++ b/cmd/compute-domain-kubelet-plugin/computedomain.go @@ -133,8 +133,8 @@ func (m *ComputeDomainManager) NewSettings(domainID string) *ComputeDomainDaemon manager: m, domainID: domainID, rootDir: fmt.Sprintf("%s/%s", m.configFilesRoot, domainID), - configTmplPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "config.tmpl.cfg"), - nodesConfigPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "nodes_config.cfg"), + configTmplPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "imexd.cfg.tmpl"), + nodesConfigPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "nodes.cfg"), } } @@ -176,7 +176,8 @@ func (s *ComputeDomainDaemonSettings) 
GetCDIContainerEditsCommon(ctx context.Con }, Mounts: []*cdispec.Mount{ { - ContainerPath: "/etc/nvidia-imex", + // Must match imexDaemonConfigDirPath ("/imexd") in cmd/compute-domain-daemon/main.go. + ContainerPath: "/imexd", HostPath: s.rootDir, Options: []string{"rw", "nosuid", "nodev", "bind"}, }, diff --git a/cmd/compute-domain-kubelet-plugin/device_state.go b/cmd/compute-domain-kubelet-plugin/device_state.go index a486e0fbf..2707c9aea 100644 --- a/cmd/compute-domain-kubelet-plugin/device_state.go +++ b/cmd/compute-domain-kubelet-plugin/device_state.go @@ -498,8 +498,8 @@ func (s *DeviceState) applyComputeDomainDaemonConfig(ctx context.Context, config // Prepare injecting IMEX daemon config files even if IMEX is not supported. // This for example creates // '/var/lib/kubelet/plugins/compute-domain.nvidia.com/domains/' on the - // host which is used as mount source mapped to /etc/nvidia-imex in the CD - // daemon container. + // host which is used as mount source mapped to /imexd in the CD daemon + // container. if err := computeDomainDaemonSettings.Prepare(ctx); err != nil { return nil, fmt.Errorf("error preparing ComputeDomain daemon settings for requests '%v' in claim '%v': %w", requests, claim.UID, err) } diff --git a/templates/compute-domain-daemon-config.tmpl.cfg b/templates/compute-domain-daemon-config.tmpl.cfg index 2e3a4c07c..00b28a3ca 100644 --- a/templates/compute-domain-daemon-config.tmpl.cfg +++ b/templates/compute-domain-daemon-config.tmpl.cfg @@ -87,7 +87,7 @@ SERVER_PORT=50000 # Possible Values: # Full path/filename string (max length of 256). # Default Value: /etc/nvidia-imex/nodes_config.cfg -IMEX_NODE_CONFIG_FILE=/etc/nvidia-imex/nodes_config.cfg +IMEX_NODE_CONFIG_FILE={{ .IMEXDaemonNodesConfigPath }} # Description: Name of the network interface used for communication. # OPTIONAL - If empty, network interface will be determined by matching bind IP to