From 7ab1f5d1442d561d26758bc102108c5d631c6635 Mon Sep 17 00:00:00 2001 From: Shereen Haj Date: Wed, 8 Oct 2025 12:00:14 +0300 Subject: [PATCH 1/4] e2e: serial: ensure MCPs and nodes are healthy At the setup of the test, check that MCPs and nodes are healthy enough for the test to run by verifying that the MCPs are all `Updated` and none of the nodes is `unschedulable`. We don't want to fail on any error so keep any unexpected results at warning level. Signed-off-by: Shereen Haj --- test/internal/fixture/fixture.go | 41 ++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/test/internal/fixture/fixture.go b/test/internal/fixture/fixture.go index 365751436..e6707adce 100644 --- a/test/internal/fixture/fixture.go +++ b/test/internal/fixture/fixture.go @@ -36,6 +36,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/yaml" + machineconfigv1 "github.com/openshift/api/machineconfiguration/v1" + "github.com/k8stopologyawareschedwg/deployer/pkg/deployer" nrtv1alpha2 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2" @@ -102,6 +104,7 @@ func SetupWithOptions(name string, nrtList nrtv1alpha2.NodeResourceTopologyList, randomizeName := (options & OptionRandomizeName) == OptionRandomizeName avoidCooldown := (options & OptionAvoidCooldown) == OptionAvoidCooldown staticClusterData := (options & OptionStaticClusterData) == OptionStaticClusterData + ginkgo.By("set up the test namespace") ns, err := setupNamespace(e2eclient.Client, name, randomizeName) if err != nil { @@ -133,6 +136,44 @@ func SetupWithOptions(name string, nrtList nrtv1alpha2.NodeResourceTopologyList, } klog.Infof("set up the fixture reference NRT List: %s", intnrt.ListToString(nrtList.Items, " fixture initial")) + ginkgo.By("warn about not updated MCPs") + var mcps machineconfigv1.MachineConfigPoolList + err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) { + err := 
e2eclient.Client.List(ctx, &mcps) + return err == nil, nil + }) + if err != nil { + klog.Errorf("failed to pull MCP items: %v", err) + } else { + for _, mcp := range mcps.Items { + conditions := mcp.Status.Conditions + for _, condition := range conditions { + if condition.Type == machineconfigv1.MachineConfigPoolUpdated { + if condition.Status != corev1.ConditionTrue { + klog.Warningf("MCP %q is not updated", mcp.Name) + klog.InfoS("MCP status", "name", mcp.Name, "conditions", conditions) + } + break + } + } + } + } + ginkgo.By("warn about unschedulable nodes") + var nodes corev1.NodeList + err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) { + err := e2eclient.Client.List(ctx, &nodes) + return err == nil, nil + }) + if err != nil { + klog.Errorf("failed to pull cluster nodes: %v", err) + } else { + for _, node := range nodes.Items { + if node.Spec.Unschedulable { + klog.Warningf("Node %q is unschedulable", node.Name) + } + } + } + return &Fixture{ Client: e2eclient.Client, K8sClient: e2eclient.K8sClient, From 163d01bec7cb1865ddfb507775981e033408fd43 Mon Sep 17 00:00:00 2001 From: Shereen Haj Date: Wed, 8 Oct 2025 13:06:37 +0300 Subject: [PATCH 2/4] e2e:serial: fix baseload addition to pod resources The old method used to apply the baseload on the same `paddingRes` over and over again. This is not causing issues as long as there are no more than one node to be padded, which is almost always the case on the MNO used in the weekly CI. However this is still incorrect, so this commit fixes it so that the padding resources with the baseload are cleanly computed for each node. 
Signed-off-by: Shereen Haj --- test/e2e/serial/tests/workload_placement_tmpol.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/e2e/serial/tests/workload_placement_tmpol.go b/test/e2e/serial/tests/workload_placement_tmpol.go index 72fc647e7..25adf4b8c 100644 --- a/test/e2e/serial/tests/workload_placement_tmpol.go +++ b/test/e2e/serial/tests/workload_placement_tmpol.go @@ -152,10 +152,11 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme Expect(err).ToNot(HaveOccurred(), "missing node load info for %q", nodeName) // TODO: multi-line value in structured log klog.InfoS("computed base load", "value", baseload) - baseload.Apply(paddingRes) + paddingResWithBaseload := paddingRes.DeepCopy() + baseload.Apply(paddingResWithBaseload) for zIdx, zone := range nrtInfo.Zones { podName := fmt.Sprintf("padding-%d-%d", nIdx, zIdx) - padPod, err := makePaddingPod(fxt.Namespace.Name, podName, zone, paddingRes) + padPod, err := makePaddingPod(fxt.Namespace.Name, podName, zone, paddingResWithBaseload) Expect(err).NotTo(HaveOccurred(), "unable to create padding pod %q on zone %q", podName, zone.Name) padPod, err = pinPodTo(padPod, nodeName, zone.Name) From 6452791cfad497422deb58ba84b510968daf534c Mon Sep 17 00:00:00 2001 From: Shereen Haj Date: Wed, 8 Oct 2025 13:34:08 +0300 Subject: [PATCH 3/4] e2e:serial: apply the baseload on the first numa zone With the old version, the baseload was considered in the padding resources of every NUMA zone. That is incorrect because it can create issues on tests that expect their testing pods to be scheduled on one node only while other nodes are unsuitable due to insufficient resources. This update makes sure that only the first NUMA zone is padded, leaving extra space for the baseload. 
Signed-off-by: Shereen Haj --- test/e2e/serial/tests/workload_placement_tmpol.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/e2e/serial/tests/workload_placement_tmpol.go b/test/e2e/serial/tests/workload_placement_tmpol.go index 25adf4b8c..7796bc8ff 100644 --- a/test/e2e/serial/tests/workload_placement_tmpol.go +++ b/test/e2e/serial/tests/workload_placement_tmpol.go @@ -154,9 +154,14 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme klog.InfoS("computed base load", "value", baseload) paddingResWithBaseload := paddingRes.DeepCopy() baseload.Apply(paddingResWithBaseload) + var zonePaddingRes corev1.ResourceList for zIdx, zone := range nrtInfo.Zones { + zonePaddingRes = paddingRes + if zIdx == 0 { + zonePaddingRes = paddingResWithBaseload + } podName := fmt.Sprintf("padding-%d-%d", nIdx, zIdx) - padPod, err := makePaddingPod(fxt.Namespace.Name, podName, zone, paddingResWithBaseload) + padPod, err := makePaddingPod(fxt.Namespace.Name, podName, zone, zonePaddingRes) Expect(err).NotTo(HaveOccurred(), "unable to create padding pod %q on zone %q", podName, zone.Name) padPod, err = pinPodTo(padPod, nodeName, zone.Name) From ac941ac8df5f16449853da62251541f36189485a Mon Sep 17 00:00:00 2001 From: Shereen Haj Date: Wed, 8 Oct 2025 14:17:05 +0300 Subject: [PATCH 4/4] e2e:serial: pull daemonset object in eventually to ensure updates Sometimes it takes a longer time for the daemonset to update its pod counts. The problem with the old code is that in an Eventually it waits for the daemonset pods number to get updated to the expected number while using an old daemonset object. To fix this, just pull the daemonset object in every iteration to ensure the code in the Eventually is using the most updated data. 
Signed-off-by: Shereen Haj --- test/e2e/serial/tests/tolerations.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/e2e/serial/tests/tolerations.go b/test/e2e/serial/tests/tolerations.go index e5159b2ea..f95eb6e69 100644 --- a/test/e2e/serial/tests/tolerations.go +++ b/test/e2e/serial/tests/tolerations.go @@ -469,10 +469,11 @@ var _ = Describe("[serial][disruptive][rtetols] numaresources RTE tolerations su // NoExecute promises the pod will be evicted "immediately" but the system will still need nonzero time to notice // and the pod will take nonzero time to terminate, so we need a Eventually block. + By(fmt.Sprintf("ensuring the RTE DS is running with less pods because taints (expected pods=%v)", len(workers)-1)) Eventually(func(g Gomega) { + Expect(fxt.Client.Get(ctx, dsKey.AsKey(), updatedDs)).To(Succeed()) pods, err = podlist.With(fxt.Client).ByDaemonset(ctx, *updatedDs) Expect(err).ToNot(HaveOccurred(), "failed to get the daemonset pods %s: %v", dsKey.String(), err) - By(fmt.Sprintf("ensuring the RTE DS is running with less pods because taints (expected pods=%v)", len(workers)-1)) g.Expect(int(updatedDs.Status.NumberReady)).To(Equal(len(workers)-1), "updated DS ready=%v original worker nodes=%v", updatedDs.Status.NumberReady, len(workers)-1) g.Expect(int(updatedDs.Status.NumberReady)).To(Equal(len(pods)), "updated DS ready=%v expected pods", updatedDs.Status.NumberReady, len(pods)) }).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).Should(Succeed())