Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 22 additions & 19 deletions cmd/compute-domain-daemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ import (
)

const (
nodesConfigPath = "/etc/nvidia-imex/nodes_config.cfg"
imexConfigPath = "/etc/nvidia-imex/config.cfg"
imexConfigTmplPath = "/etc/nvidia-imex/config.tmpl.cfg"
imexBinaryName = "nvidia-imex"
imexCtlBinaryName = "nvidia-imex-ctl"
imexDaemonConfigDirPath = "/imexd"
imexDaemonConfigPath = imexDaemonConfigDirPath + "/imexd.cfg"
imexDaemonConfigTmplPath = imexDaemonConfigDirPath + "/imexd.cfg.tmpl"
imexDaemonNodesConfigPath = imexDaemonConfigDirPath + "/nodes.cfg"
imexDaemonBinaryName = "nvidia-imex"
imexCtlBinaryName = "nvidia-imex-ctl"
)

type Flags struct {
Expand All @@ -60,7 +61,8 @@ type Flags struct {
}

type IMEXConfigTemplateData struct {
IMEXCmdBindInterfaceIP string
IMEXCmdBindInterfaceIP string
IMEXDaemonNodesConfigPath string
}

func main() {
Expand Down Expand Up @@ -218,7 +220,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
var dnsNameManager *DNSNameManager
if featuregates.Enabled(featuregates.IMEXDaemonsWithDNSNames) {
// Prepare DNS name manager
dnsNameManager = NewDNSNameManager(flags.cliqueID, flags.maxNodesPerIMEXDomain, nodesConfigPath)
dnsNameManager = NewDNSNameManager(flags.cliqueID, flags.maxNodesPerIMEXDomain, imexDaemonNodesConfigPath)

// Create static nodes config file with DNS names
if err := dnsNameManager.WriteNodesConfig(); err != nil {
Expand All @@ -227,7 +229,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
}

// Prepare IMEX daemon process manager.
daemonCommandLine := []string{imexBinaryName, "-c", imexConfigPath}
daemonCommandLine := []string{imexDaemonBinaryName, "-c", imexDaemonConfigPath}
processManager := NewProcessManager(daemonCommandLine)

// Prepare controller with CD manager (not invoking the controller yet).
Expand Down Expand Up @@ -385,7 +387,7 @@ func check(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
// return". This probes if the local IMEX daemon is ready (not the entire
// domain). Reference:
// https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/cmdservice.html
cmd := exec.CommandContext(ctx, imexCtlBinaryName, "-q")
cmd := exec.CommandContext(ctx, imexCtlBinaryName, "-c", imexDaemonConfigPath, "-q")

// Spawn child, collect standard streams.
outerr, err := cmd.CombinedOutput()
Expand All @@ -404,10 +406,11 @@ func check(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
// writeIMEXConfig renders the config template with the pod IP and writes it to the final config file.
func writeIMEXConfig(podIP string) error {
configTemplateData := IMEXConfigTemplateData{
IMEXCmdBindInterfaceIP: podIP,
IMEXCmdBindInterfaceIP: podIP,
IMEXDaemonNodesConfigPath: imexDaemonNodesConfigPath,
}

tmpl, err := template.ParseFiles(imexConfigTmplPath)
tmpl, err := template.ParseFiles(imexDaemonConfigTmplPath)
if err != nil {
return fmt.Errorf("error parsing template file: %w", err)
}
Expand All @@ -418,29 +421,29 @@ func writeIMEXConfig(podIP string) error {
}

// Ensure the directory exists
dir := filepath.Dir(imexConfigPath)
dir := filepath.Dir(imexDaemonConfigPath)
if err := os.MkdirAll(dir, 0755); err != nil {
return fmt.Errorf("failed to create directory %s: %w", dir, err)
}

if err := os.WriteFile(imexConfigPath, configFile.Bytes(), 0644); err != nil {
return fmt.Errorf("error writing config file %v: %w", imexConfigPath, err)
if err := os.WriteFile(imexDaemonConfigPath, configFile.Bytes(), 0644); err != nil {
return fmt.Errorf("error writing config file %v: %w", imexDaemonConfigPath, err)
}

klog.Infof("Updated IMEX config file with pod IP: %s", podIP)
klog.Infof("Rendered IMEX daemon config file with: %v", configTemplateData)
return nil
}

// writeNodesConfig creates a nodesConfig file with IPs for nodes in the same clique.
func writeNodesConfig(cliqueID string, nodes []*nvapi.ComputeDomainNode) error {
// Ensure the directory exists
dir := filepath.Dir(nodesConfigPath)
dir := filepath.Dir(imexDaemonNodesConfigPath)
if err := os.MkdirAll(dir, 0755); err != nil {
return fmt.Errorf("failed to create directory %s: %w", dir, err)
}

// Create or overwrite the nodesConfig file
f, err := os.Create(nodesConfigPath)
f, err := os.Create(imexDaemonNodesConfigPath)
if err != nil {
return fmt.Errorf("failed to create nodes config file: %w", err)
}
Expand All @@ -467,10 +470,10 @@ func writeNodesConfig(cliqueID string, nodes []*nvapi.ComputeDomainNode) error {
// Read and log the contents of the nodes configuration file. Return an error if
// the file cannot be read.
func logNodesConfig() error {
content, err := os.ReadFile(nodesConfigPath)
content, err := os.ReadFile(imexDaemonNodesConfigPath)
if err != nil {
return fmt.Errorf("failed to read nodes config: %w", err)
}
klog.Infof("Current %s:\n%s", nodesConfigPath, string(content))
klog.Infof("Current %s:\n%s", imexDaemonNodesConfigPath, string(content))
return nil
}
7 changes: 4 additions & 3 deletions cmd/compute-domain-kubelet-plugin/computedomain.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ func (m *ComputeDomainManager) NewSettings(domainID string) *ComputeDomainDaemon
manager: m,
domainID: domainID,
rootDir: fmt.Sprintf("%s/%s", m.configFilesRoot, domainID),
configTmplPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "config.tmpl.cfg"),
nodesConfigPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "nodes_config.cfg"),
configTmplPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "imexd.cfg.tmpl"),
nodesConfigPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "nodes.cfg"),
}
}

Expand Down Expand Up @@ -176,7 +176,8 @@ func (s *ComputeDomainDaemonSettings) GetCDIContainerEditsCommon(ctx context.Con
},
Mounts: []*cdispec.Mount{
{
ContainerPath: "/etc/nvidia-imex",
// imexDaemonConfigDirPath = "/imexd"
ContainerPath: "/imexd",
HostPath: s.rootDir,
Options: []string{"rw", "nosuid", "nodev", "bind"},
},
Expand Down
4 changes: 2 additions & 2 deletions cmd/compute-domain-kubelet-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -498,8 +498,8 @@ func (s *DeviceState) applyComputeDomainDaemonConfig(ctx context.Context, config
// Prepare injecting IMEX daemon config files even if IMEX is not supported.
// This for example creates
// '/var/lib/kubelet/plugins/compute-domain.nvidia.com/domains/<uid>' on the
// host which is used as mount source mapped to /etc/nvidia-imex in the CD
// daemon container.
// host which is used as mount source mapped to /imexd in the CD daemon
// container.
if err := computeDomainDaemonSettings.Prepare(ctx); err != nil {
return nil, fmt.Errorf("error preparing ComputeDomain daemon settings for requests '%v' in claim '%v': %w", requests, claim.UID, err)
}
Expand Down
2 changes: 1 addition & 1 deletion templates/compute-domain-daemon-config.tmpl.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ SERVER_PORT=50000
# Possible Values:
# Full path/filename string (max length of 256).
# Default Value: /etc/nvidia-imex/nodes_config.cfg
IMEX_NODE_CONFIG_FILE=/etc/nvidia-imex/nodes_config.cfg
IMEX_NODE_CONFIG_FILE={{ .IMEXDaemonNodesConfigPath }}

# Description: Name of the network interface used for communication.
# OPTIONAL - If empty, network interface will be determined by matching bind IP to
Expand Down