Skip to content

Commit 00c087f

Browse files
authored
fix: in the status observer, trigger topology discovery when pods are ready (#183)
Signed-off-by: Dmitry Shmulevich <[email protected]>
1 parent 0663765 commit 00c087f

File tree

7 files changed

+299
-141
lines changed

7 files changed

+299
-141
lines changed

internal/k8s/utils.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,15 @@ func GetPodsByLabels(ctx context.Context, client *kubernetes.Clientset, namespac
3333
return client.CoreV1().Pods(namespace).List(ctx, opt)
3434
}
3535

36+
func IsPodReady(pod *corev1.Pod) bool {
37+
for _, cond := range pod.Status.Conditions {
38+
if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
39+
return true
40+
}
41+
}
42+
return false
43+
}
44+
3645
func GetDaemonSetPods(ctx context.Context, client *kubernetes.Clientset, name, namespace, nodename string) (*corev1.PodList, error) {
3746
ds, err := client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{})
3847
if err != nil {

internal/k8s/utils_test.go

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright 2025 NVIDIA CORPORATION
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package k8s
7+
8+
import (
9+
"testing"
10+
11+
"github.com/stretchr/testify/require"
12+
corev1 "k8s.io/api/core/v1"
13+
)
14+
15+
func TestIsPodReady(t *testing.T) {
16+
testCases := []struct {
17+
name string
18+
pod *corev1.Pod
19+
ready bool
20+
}{
21+
{
22+
name: "Case 1: ready",
23+
pod: &corev1.Pod{
24+
Status: corev1.PodStatus{
25+
Conditions: []corev1.PodCondition{
26+
{
27+
Type: corev1.DisruptionTarget,
28+
Status: corev1.ConditionUnknown,
29+
},
30+
{
31+
Type: corev1.PodReady,
32+
Status: corev1.ConditionTrue,
33+
},
34+
},
35+
},
36+
},
37+
ready: true,
38+
},
39+
{
40+
name: "Case 2: implicit not ready",
41+
pod: &corev1.Pod{
42+
Status: corev1.PodStatus{
43+
Conditions: []corev1.PodCondition{
44+
{
45+
Type: corev1.DisruptionTarget,
46+
Status: corev1.ConditionUnknown,
47+
},
48+
{
49+
Type: corev1.ContainersReady,
50+
Status: corev1.ConditionTrue,
51+
},
52+
},
53+
},
54+
},
55+
ready: false,
56+
},
57+
{
58+
name: "Case 3: explicit not ready",
59+
pod: &corev1.Pod{
60+
Status: corev1.PodStatus{
61+
Conditions: []corev1.PodCondition{
62+
{
63+
Type: corev1.DisruptionTarget,
64+
Status: corev1.ConditionUnknown,
65+
},
66+
{
67+
Type: corev1.PodReady,
68+
Status: corev1.ConditionFalse,
69+
},
70+
},
71+
},
72+
},
73+
ready: false,
74+
},
75+
}
76+
77+
for _, tc := range testCases {
78+
t.Run(tc.name, func(t *testing.T) {
79+
require.Equal(t, tc.ready, IsPodReady(tc.pod))
80+
})
81+
}
82+
}

pkg/engines/slinky/engine.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,9 @@ func (eng *SlinkyEngine) GetComputeInstances(ctx context.Context, _ engines.Envi
133133
// map k8s host name to SLURM host name
134134
nodeMap := make(map[string]string)
135135
for _, pod := range pods.Items {
136+
if !k8s.IsPodReady(&pod) {
137+
continue
138+
}
136139
host, ok := pod.Labels["slurm.node.name"]
137140
if !ok {
138141
host = pod.Spec.Hostname

pkg/node_observer/controller.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ import (
3131
)
3232

3333
type Controller struct {
34-
ctx context.Context
35-
client kubernetes.Interface
36-
nodeInformer *NodeInformer
34+
ctx context.Context
35+
client kubernetes.Interface
36+
statusInformer *StatusInformer
3737
}
3838

3939
func NewController(ctx context.Context, client kubernetes.Interface, cfg *Config) (*Controller, error) {
@@ -50,23 +50,23 @@ func NewController(ctx context.Context, client kubernetes.Interface, cfg *Config
5050
req.Header.Set("Content-Type", "application/json")
5151
return req, nil
5252
}
53-
nodeInformer, err := NewNodeInformer(ctx, client, &cfg.Trigger, f)
53+
statusInformer, err := NewStatusInformer(ctx, client, &cfg.Trigger, f)
5454
if err != nil {
5555
return nil, err
5656
}
5757
return &Controller{
58-
ctx: ctx,
59-
client: client,
60-
nodeInformer: nodeInformer,
58+
ctx: ctx,
59+
client: client,
60+
statusInformer: statusInformer,
6161
}, nil
6262
}
6363

6464
func (c *Controller) Start() error {
6565
klog.Infof("Starting state observer")
66-
return c.nodeInformer.Start()
66+
return c.statusInformer.Start()
6767
}
6868

6969
func (c *Controller) Stop(err error) {
7070
klog.Infof("Stopping state observer")
71-
c.nodeInformer.Stop(err)
71+
c.statusInformer.Stop(err)
7272
}

pkg/node_observer/node_informer.go

Lines changed: 0 additions & 127 deletions
This file was deleted.

0 commit comments

Comments
 (0)