Skip to content

Commit e734b86

Browse files
PCP-5506: In the EC Vertex environment, lxd-initializer pods in a MAAS HCP cluster are stuck in the Init:CrashLoopBackOff state (#291) (#292)
(cherry picked from commit 2042ed7) Co-authored-by: Amit Sahastrabuddhe <[email protected]>
1 parent 03a18b4 commit e734b86

File tree

4 files changed

+70
-38
lines changed

4 files changed

+70
-38
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ endif
2626
# Image URL to use all building/pushing image targets
2727
IMAGE_NAME := cluster-api-provider-maas-controller
2828
REGISTRY ?= "us-east1-docker.pkg.dev/spectro-images/dev/${USER}/cluster-api"
29-
SPECTRO_VERSION ?= 4.0.0-dev-29082025
29+
SPECTRO_VERSION ?= 4.8.3-dev-12112025
3030
IMG_TAG ?= v0.6.1-spectro-${SPECTRO_VERSION}
3131
CONTROLLER_IMG ?= ${REGISTRY}/${IMAGE_NAME}
3232

controllers/lxd_initializer_ds.go

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,13 @@ func (r *MaasClusterReconciler) ensureLXDInitializerDS(ctx context.Context, clus
6767
if !r.anyNodeNeedsInitialization(ctx, remoteClient) {
6868
r.Log.Info("All nodes already labeled initialized; considering DS cleanup", "namespace", dsNamespace, "ds", dsName)
6969
if done, err := r.maybeShortCircuitDelete(ctx, remoteClient, dsNamespace, desiredCP, dsName); err != nil {
70+
r.Log.Error(err, "failed to maybe short circuit delete", "namespace", dsNamespace, "ds", dsName)
7071
return err
7172
} else if done {
73+
r.Log.Info("deleted existing initializer DS - all nodes are ready and initialized", "namespace", dsNamespace, "ds", dsName)
7274
return nil
7375
}
76+
r.Log.Info("no nodes need initialization; skipping DS creation", "namespace", dsNamespace, "ds", dsName)
7477
return nil
7578
}
7679

@@ -80,28 +83,68 @@ func (r *MaasClusterReconciler) ensureLXDInitializerDS(ctx context.Context, clus
8083
// }
8184

8285
if err := r.deleteExistingInitializerDS(ctx, remoteClient, dsNamespace); err != nil {
86+
r.Log.Error(err, "failed to delete existing initializer DS", "namespace", dsNamespace, "ds", dsName)
8387
return err
8488
}
8589

8690
// Ensure RBAC resources are created on the target cluster
8791
if err := r.ensureLXDInitializerRBACOnTarget(ctx, remoteClient, dsNamespace); err != nil {
92+
r.Log.Error(err, "failed to ensure LXD initializer RBAC", "namespace", dsNamespace, "ds", dsName)
8893
return fmt.Errorf("failed to ensure LXD initializer RBAC: %v", err)
8994
}
9095

9196
if done, err := r.maybeShortCircuitDelete(ctx, remoteClient, dsNamespace, desiredCP, dsName); err != nil {
97+
r.Log.Error(err, "failed to maybe short circuit delete", "namespace", dsNamespace, "ds", dsName)
9298
return err
9399
} else if done {
100+
r.Log.Info("deleted existing initializer DS - all nodes are ready and initialized", "namespace", dsNamespace, "ds", dsName)
94101
return nil
95102
}
96103

97104
ds, err := r.renderDaemonSetForCluster(clusterScope, dsName, dsNamespace)
98105
if err != nil {
106+
r.Log.Error(err, "failed to render DaemonSet for cluster", "namespace", dsNamespace, "ds", dsName)
99107
return err
100108
}
101109

102-
// Do not set owner refs across clusters; just create/patch on target cluster
103-
_, err = controllerutil.CreateOrPatch(ctx, remoteClient, ds, func() error { return nil })
104-
return err
110+
// Do not set owner refs across clusters; just create/patch on target cluster.
111+
// Mutate existing DaemonSet so changes to template/spec take effect on reconcile.
112+
current := &appsv1.DaemonSet{}
113+
current.Name = dsName
114+
current.Namespace = dsNamespace
115+
116+
_, err = controllerutil.CreateOrPatch(ctx, remoteClient, current, func() error {
117+
// Preserve immutable selector if already present; align labels.
118+
current.Labels = ds.Labels
119+
current.Annotations = ds.Annotations
120+
121+
// Update pod template and mutable spec fields
122+
current.Spec.Template = ds.Spec.Template
123+
current.Spec.UpdateStrategy = ds.Spec.UpdateStrategy
124+
current.Spec.MinReadySeconds = ds.Spec.MinReadySeconds
125+
current.Spec.RevisionHistoryLimit = ds.Spec.RevisionHistoryLimit
126+
127+
// Initialize selector if missing (only valid on create)
128+
if current.Spec.Selector == nil || len(current.Spec.Selector.MatchLabels) == 0 {
129+
current.Spec.Selector = ds.Spec.Selector
130+
}
131+
// Ensure template labels include selector labels
132+
if current.Spec.Selector != nil && len(current.Spec.Selector.MatchLabels) > 0 {
133+
if current.Spec.Template.Labels == nil {
134+
current.Spec.Template.Labels = map[string]string{}
135+
}
136+
for k, v := range current.Spec.Selector.MatchLabels {
137+
current.Spec.Template.Labels[k] = v
138+
}
139+
}
140+
return nil
141+
})
142+
if err != nil {
143+
r.Log.Error(err, "failed to create/patch DaemonSet", "namespace", dsNamespace, "ds", dsName)
144+
return err
145+
}
146+
r.Log.Info("created/patched DaemonSet", "namespace", dsNamespace, "ds", dsName)
147+
return nil
105148
}
106149

107150
// ensureLXDInitializerRBACOnTarget creates the RBAC resources for lxd-initializer on the target cluster
@@ -295,21 +338,18 @@ func (r *MaasClusterReconciler) maybeShortCircuitDelete(ctx context.Context, rem
295338
}
296339
}
297340

298-
// Only delete if:
299-
// 1. We have exactly desiredCP nodes (not more, which would indicate maintenance/new nodes)
300-
// 2. All nodes are Ready
301-
// 3. All nodes are initialized
302-
if int64(len(shortCircuitNodes.Items)) == int64(desiredCP) &&
303-
int64(readyCount) == int64(desiredCP) &&
304-
int64(initCount) >= int64(desiredCP) {
341+
// Delete initializer DS only when ALL nodes (control-plane + worker) are initialized.
342+
// This matches the new requirement to register both CP and worker nodes.
343+
totalNodes := len(shortCircuitNodes.Items)
344+
if totalNodes > 0 && initCount == totalNodes {
305345
shortCircuitDSList := &appsv1.DaemonSetList{}
306346
if err := remoteClient.List(ctx, shortCircuitDSList, client.InNamespace(namespace), client.MatchingLabels{"app": dsName}); err == nil {
307347
for _, ds := range shortCircuitDSList.Items {
308348
_ = remoteClient.Delete(ctx, &ds)
309349
}
310350
}
311-
r.Log.Info("Deleted LXD initializer DaemonSet - all nodes are ready and initialized",
312-
"desiredCP", desiredCP, "totalNodes", len(shortCircuitNodes.Items), "readyNodes", readyCount, "initializedNodes", initCount)
351+
r.Log.Info("Deleted LXD initializer DaemonSet - all nodes initialized",
352+
"desiredCP", desiredCP, "totalNodes", totalNodes, "readyNodes", readyCount, "initializedNodes", initCount)
313353
return true, nil
314354
}
315355
return false, nil

controllers/templates/lxd_initializer_ds.yaml

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -33,27 +33,24 @@ spec:
3333
- -c
3434
- |
3535
set -ex
36-
if ! command -v lxd >/dev/null 2>&1; then
37-
echo "LXD not present, installing via snap";
38-
apt-get update;
39-
apt-get install -y snapd;
40-
systemctl enable --now snapd.socket;
41-
snap install lxd --channel=5.0/stable;
36+
# Check on the HOST, not in the container
37+
if ! nsenter -t 1 -m -p -- bash -lc 'command -v lxd >/dev/null 2>&1'; then
38+
echo "LXD not present on host, installing snapd and LXD on host";
39+
nsenter -t 1 -m -p -- bash -lc 'export DEBIAN_FRONTEND=noninteractive; apt-get update && apt-get install -y snapd'
40+
# Enable and start snapd on the host
41+
nsenter -t 1 -m -p -- systemctl enable --now snapd.socket
42+
# Install LXD via snap on the host
43+
nsenter -t 1 -m -p -- snap install lxd --channel=5.0/stable
4244
fi
4345
echo "Ensuring LXD daemon is running on host";
44-
if nsenter -t 1 -m -p -- systemctl is-active --quiet snap.lxd.daemon; then
45-
echo "LXD daemon already active";
46-
else
47-
echo "Starting LXD daemon via host systemd";
48-
nsenter -t 1 -m -p -- systemctl start snap.lxd.daemon || snap start --enable lxd.daemon;
49-
fi
46+
# Start/enable via snap (avoid systemd invocation from the pod)
47+
nsenter -t 1 -m -p -- snap start --enable lxd.daemon || true
5048
# Wait for LXD to report readiness (up to 5 minutes)
5149
echo "Waiting for LXD to become ready on host (timeout 5 min)…"
5250
if ! nsenter -t 1 -m -p -- /snap/bin/lxd waitready --timeout 300 ; then
5351
echo "LXD did not become ready after 5 minutes"; exit 1;
5452
fi
5553
echo "Host LXD is ready";
56-
5754
securityContext:
5855
privileged: true
5956
volumeMounts:
@@ -71,7 +68,7 @@ spec:
7168
mountPropagation: HostToContainer
7269
containers:
7370
- name: lxd-initializer
74-
image: us-east1-docker.pkg.dev/spectro-images/dev/jayeshsrivastava/cluster-api/lxd-initializer:v0.6.1-spectro-4.8.2
71+
image: us-east1-docker.pkg.dev/spectro-images/dev/amit/cluster-api/lxd-initializer:v0.6.1-spectro-4.8.3
7572
imagePullPolicy: Always
7673
securityContext:
7774
privileged: true

lxd-initializer/lxd-initializer.go

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -644,11 +644,11 @@ func logLXDDiagnostics() {
644644
}
645645

646646
// host daemon status
647-
out, err := exec.Command("nsenter", "-t", "1", "-m", "-p", "--", "systemctl", "status", "snap.lxd.daemon").CombinedOutput()
647+
out, err := exec.Command("nsenter", "-t", "1", "-m", "-p", "--", "snap", "services", "lxd").CombinedOutput()
648648
if err == nil {
649-
log.Printf("systemctl status snap.lxd.daemon:\n%s", string(out))
649+
log.Printf("snap services lxd:\n%s", string(out))
650650
} else {
651-
log.Printf("nsenter systemctl status failed: %v", err)
651+
log.Printf("nsenter snap services failed: %v", err)
652652
}
653653

654654
// process list
@@ -974,16 +974,11 @@ func configureLXDNetwork(trustPassword, hostIP string) error {
974974
return err
975975
}
976976
}
977-
// Restart LXD to apply changes
978-
cmd = exec.Command("systemctl", "restart", "snap.lxd.daemon")
977+
// Restart LXD to apply changes (avoid systemd; use snap)
978+
cmd = exec.Command("nsenter", "-t", "1", "-m", "-p", "--", "snap", "restart", "lxd")
979979
output, err = cmd.CombinedOutput()
980980
if err != nil {
981-
log.Printf("systemctl restart inside container failed (%v), trying nsenter fallback", err)
982-
cmd = exec.Command("nsenter", "-t", "1", "-m", "-p", "--", "systemctl", "restart", "snap.lxd.daemon")
983-
output, err = cmd.CombinedOutput()
984-
if err != nil {
985-
return fmt.Errorf("failed to restart LXD (fallback): %s: %w", string(output), err)
986-
}
981+
return fmt.Errorf("failed to restart LXD via snap: %s: %w", string(output), err)
987982
}
988983

989984
log.Printf("LXD configured to listen on %s", address)

0 commit comments

Comments
 (0)