diff --git a/cmd/compute-domain-daemon/controller.go b/cmd/compute-domain-daemon/controller.go index 0906ccc2a..20e14d0b1 100644 --- a/cmd/compute-domain-daemon/controller.go +++ b/cmd/compute-domain-daemon/controller.go @@ -111,9 +111,9 @@ func (c *Controller) Run(ctx context.Context) error { return nil } -// GetNodesUpdateChan() returns a channel that only ever yields a full set of nodes, -// i.e. during startup this blocks until the expected number of nodes is present -// in CD status. +// GetNodesUpdateChan() returns a channel that yields updates for the nodes +// currently present in the CD status. This is only a complete set of nodes +// (size `numNodes`) if IMEXDaemonsWithDNSNames=false. func (c *Controller) GetNodesUpdateChan() chan []*nvapi.ComputeDomainNode { return c.computeDomainManager.GetNodesUpdateChan() } diff --git a/cmd/compute-domain-daemon/dnsnames.go b/cmd/compute-domain-daemon/dnsnames.go index 611c22803..818c4294f 100644 --- a/cmd/compute-domain-daemon/dnsnames.go +++ b/cmd/compute-domain-daemon/dnsnames.go @@ -57,8 +57,10 @@ func NewDNSNameManager(cliqueID string, maxNodesPerIMEXDomain int, nodesConfigPa } } -// UpdateDNSNameMappings updates the /etc/hosts file with any new IP to DNS name mappings. -func (m *DNSNameManager) UpdateDNSNameMappings(nodes []*nvapi.ComputeDomainNode) error { +// UpdateDNSNameMappings updates the /etc/hosts file with any new IP to DNS name +// mappings. The boolean return value indicates whether the hosts file was +// updated or not (it must be ignored when the returned error is non-nil). 
+func (m *DNSNameManager) UpdateDNSNameMappings(nodes []*nvapi.ComputeDomainNode) (bool, error) { m.Lock() defer m.Unlock() @@ -78,7 +80,7 @@ func (m *DNSNameManager) UpdateDNSNameMappings(nodes []*nvapi.ComputeDomainNode) // Construct the DNS name from the node index dnsName, err := m.constructDNSName(node) if err != nil { - return fmt.Errorf("failed to allocate DNS name for IP %s: %w", node.IPAddress, err) + return false, fmt.Errorf("failed to allocate DNS name for IP %s: %w", node.IPAddress, err) } // Assign the IP -> DNS name mapping @@ -87,14 +89,14 @@ func (m *DNSNameManager) UpdateDNSNameMappings(nodes []*nvapi.ComputeDomainNode) // If the existing ipToDNSName mappings are unchanged, exit early if maps.Equal(ipToDNSName, m.ipToDNSName) { - return nil + return false, nil } // Otherwise, update the cached ipToDNSName mapping m.ipToDNSName = ipToDNSName - // And updated the hosts file with new mappings - return m.updateHostsFile() + // And update the hosts file with the new mapping + return true, m.updateHostsFile() } // LogDNSNameMappings logs the current compute-domain-daemon mappings from memory. 
diff --git a/cmd/compute-domain-daemon/main.go b/cmd/compute-domain-daemon/main.go index f05b9da4e..687b084ee 100644 --- a/cmd/compute-domain-daemon/main.go +++ b/cmd/compute-domain-daemon/main.go @@ -322,13 +322,37 @@ func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controlle klog.Infof("shutdown: stop IMEXDaemonUpdateLoopWithDNSNames") return nil case nodes := <-controller.GetNodesUpdateChan(): - if err := dnsNameManager.UpdateDNSNameMappings(nodes); err != nil { + updated, err := dnsNameManager.UpdateDNSNameMappings(nodes) + if err != nil { return fmt.Errorf("failed to update DNS name => IP mappings: %w", err) } - if err := processManager.EnsureStarted(); err != nil { + + fresh, err := processManager.EnsureStarted() + if err != nil { return fmt.Errorf("failed to ensure IMEX daemon is started: %w", err) } + dnsNameManager.LogDNSNameMappings() + + // Skip sending SIGUSR1 when the process is fresh (has newly been + // created) or when this was a no-op update. TODO: review skipping + // this also if the new set of IP addresses only strictly removes + // addresses compared to the old set (then we don't need to force + // the daemon to re-resolve & re-connect). + if !updated || fresh { + continue + } + + // Actively ask the IMEX daemon to re-read its config and to + // re-connect to its peers (involving DNS name re-resolution). + klog.Infof("updated DNS/IP mapping, old process: send SIGUSR1") + if err := processManager.Signal(syscall.SIGUSR1); err != nil { + // Only log (ignore this error for now: if the process went away + // unexpectedly, the process manager will handle that. If any + // other error resulted in bad signal delivery, we may get away + // with it). 
+ klog.Errorf("failed to send SIGUSR1 to child process: %s", err) + } } } } diff --git a/cmd/compute-domain-daemon/process.go b/cmd/compute-domain-daemon/process.go index 4a832087e..c0a9ba8c1 100644 --- a/cmd/compute-domain-daemon/process.go +++ b/cmd/compute-domain-daemon/process.go @@ -56,12 +56,27 @@ func (m *ProcessManager) Restart() error { return m.start() } -// EnsureStarted starts the process if it is not already running. If the process is already started, this is a no-op. -func (m *ProcessManager) EnsureStarted() error { +// EnsureStarted starts the process if it is not already running. If the process +// is already started, this is a no-op. The boolean return value indicates +// `new`, i.e. it is `true` if the process was _newly_ started. It must be +// ignored when the returned error is non-nil. +func (m *ProcessManager) EnsureStarted() (bool, error) { if m.handle != nil { - return nil + return false, nil } - return m.start() + return true, m.start() +} + +// Signal() attempts to send the provided signal to the managed child process. +// Any error is emitted to the caller and must be handled there. +func (m *ProcessManager) Signal(s os.Signal) error { + m.Lock() + defer m.Unlock() + + if m.handle == nil { + return fmt.Errorf("pm: sending signal %s failed: not started", s) + } + return m.handle.Process.Signal(s) } func (m *ProcessManager) start() error {