diff --git a/test/e2e/serial/tests/tolerations.go b/test/e2e/serial/tests/tolerations.go
index e5159b2ea..f95eb6e69 100644
--- a/test/e2e/serial/tests/tolerations.go
+++ b/test/e2e/serial/tests/tolerations.go
@@ -469,10 +469,11 @@ var _ = Describe("[serial][disruptive][rtetols] numaresources RTE tolerations su
 			// NoExecute promises the pod will be evicted "immediately" but the system will still need nonzero time to notice
 			// and the pod will take nonzero time to terminate, so we need a Eventually block.
+			By(fmt.Sprintf("ensuring the RTE DS is running with less pods because taints (expected pods=%v)", len(workers)-1))
 			Eventually(func(g Gomega) {
+				g.Expect(fxt.Client.Get(ctx, dsKey.AsKey(), updatedDs)).To(Succeed())
 				pods, err = podlist.With(fxt.Client).ByDaemonset(ctx, *updatedDs)
 				Expect(err).ToNot(HaveOccurred(), "failed to get the daemonset pods %s: %v", dsKey.String(), err)
-				By(fmt.Sprintf("ensuring the RTE DS is running with less pods because taints (expected pods=%v)", len(workers)-1))
 				g.Expect(int(updatedDs.Status.NumberReady)).To(Equal(len(workers)-1), "updated DS ready=%v original worker nodes=%v", updatedDs.Status.NumberReady, len(workers)-1)
 				g.Expect(int(updatedDs.Status.NumberReady)).To(Equal(len(pods)), "updated DS ready=%v expected pods", updatedDs.Status.NumberReady, len(pods))
 			}).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).Should(Succeed())
 
diff --git a/test/e2e/serial/tests/workload_placement_tmpol.go b/test/e2e/serial/tests/workload_placement_tmpol.go
index 72fc647e7..7796bc8ff 100644
--- a/test/e2e/serial/tests/workload_placement_tmpol.go
+++ b/test/e2e/serial/tests/workload_placement_tmpol.go
@@ -152,10 +152,16 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme
 			Expect(err).ToNot(HaveOccurred(), "missing node load info for %q", nodeName)
 			// TODO: multi-line value in structured log
 			klog.InfoS("computed base load", "value", baseload)
-			baseload.Apply(paddingRes)
+			paddingResWithBaseload := paddingRes.DeepCopy()
+			baseload.Apply(paddingResWithBaseload)
+			var zonePaddingRes corev1.ResourceList
 			for zIdx, zone := range nrtInfo.Zones {
+				zonePaddingRes = paddingRes
+				if zIdx == 0 {
+					zonePaddingRes = paddingResWithBaseload
+				}
 				podName := fmt.Sprintf("padding-%d-%d", nIdx, zIdx)
-				padPod, err := makePaddingPod(fxt.Namespace.Name, podName, zone, paddingRes)
+				padPod, err := makePaddingPod(fxt.Namespace.Name, podName, zone, zonePaddingRes)
 				Expect(err).NotTo(HaveOccurred(), "unable to create padding pod %q on zone %q", podName, zone.Name)
 
 				padPod, err = pinPodTo(padPod, nodeName, zone.Name)
diff --git a/test/internal/fixture/fixture.go b/test/internal/fixture/fixture.go
index 365751436..e6707adce 100644
--- a/test/internal/fixture/fixture.go
+++ b/test/internal/fixture/fixture.go
@@ -36,6 +36,8 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/yaml"
 
+	machineconfigv1 "github.com/openshift/api/machineconfiguration/v1"
+
 	"github.com/k8stopologyawareschedwg/deployer/pkg/deployer"
 	nrtv1alpha2 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
 
@@ -102,6 +104,7 @@ func SetupWithOptions(name string, nrtList nrtv1alpha2.NodeResourceTopologyList,
 	randomizeName := (options & OptionRandomizeName) == OptionRandomizeName
 	avoidCooldown := (options & OptionAvoidCooldown) == OptionAvoidCooldown
 	staticClusterData := (options & OptionStaticClusterData) == OptionStaticClusterData
 
+	ginkgo.By("set up the test namespace")
 	ns, err := setupNamespace(e2eclient.Client, name, randomizeName)
 	if err != nil {
@@ -133,6 +136,44 @@ func SetupWithOptions(name string, nrtList nrtv1alpha2.NodeResourceTopologyList,
 	}
 
 	klog.Infof("set up the fixture reference NRT List: %s", intnrt.ListToString(nrtList.Items, " fixture initial"))
+	ginkgo.By("warn about not updated MCPs")
+	var mcps machineconfigv1.MachineConfigPoolList
+	err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) {
+		err := e2eclient.Client.List(ctx, &mcps)
+		return err == nil, nil
+	})
+	if err != nil {
+		klog.Errorf("failed to pull MCP items: %v", err)
+	} else {
+		for _, mcp := range mcps.Items {
+			conditions := mcp.Status.Conditions
+			for _, condition := range conditions {
+				if condition.Type == machineconfigv1.MachineConfigPoolUpdated {
+					if condition.Status != corev1.ConditionTrue {
+						klog.Warningf("MCP %q is not updated", mcp.Name)
+						klog.InfoS("MCP status", "name", mcp.Name, "conditions", conditions)
+					}
+					break
+				}
+			}
+		}
+	}
+	ginkgo.By("warn about unschedulable nodes")
+	var nodes corev1.NodeList
+	err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) {
+		err := e2eclient.Client.List(ctx, &nodes)
+		return err == nil, nil
+	})
+	if err != nil {
+		klog.Errorf("failed to pull cluster nodes: %v", err)
+	} else {
+		for _, node := range nodes.Items {
+			if node.Spec.Unschedulable {
+				klog.Warningf("Node %q is unschedulable", node.Name)
+			}
+		}
+	}
+
 	return &Fixture{
 		Client: e2eclient.Client,
 		K8sClient: e2eclient.K8sClient,