Skip to content

Commit 2d52a14

Browse files
committed
feat: add option to select k8s nodes for topology config
Signed-off-by: Dmitry Shmulevich <[email protected]>
1 parent 00c087f commit 2d52a14

File tree

21 files changed

+248
-120
lines changed

21 files changed

+248
-120
lines changed

charts/topograph/Chart.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 0.1.0
18+
version: 0.2.0
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
@@ -25,8 +25,8 @@ appVersion: "1.16.0"
2525

2626
dependencies:
2727
- name: node-data-broker
28-
version: 0.1.0
28+
version: 0.2.0
2929
repository: "file://charts/node-data-broker"
3030
- name: node-observer
31-
version: 0.1.0
31+
version: 0.2.0
3232
repository: "file://charts/node-observer"

charts/topograph/charts/node-data-broker/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 0.1.0
18+
version: 0.2.0
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/topograph/charts/node-observer/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 0.1.0
18+
version: 0.2.0
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/topograph/values-slinky-block-example.yaml

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,20 @@
44

55
global:
66
# provider: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
7-
provider: aws
8-
# engine: "k8s" or "slinky"
9-
engine: slinky
10-
engineParams:
11-
namespace: slurm
12-
podSelector:
13-
matchLabels:
14-
app.kubernetes.io/component: compute
15-
plugin: topology/block
16-
block_sizes: 4
17-
topologyConfigPath: topology.conf
18-
topologyConfigmapName: slurm-config
7+
provider:
8+
# name: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
9+
name: aws
10+
engine:
11+
name: slinky
12+
params:
13+
namespace: slurm
14+
podSelector:
15+
matchLabels:
16+
app.kubernetes.io/component: compute
17+
plugin: topology/block
18+
block_sizes: 4
19+
topologyConfigPath: topology.conf
20+
topologyConfigmapName: slurm-config
1921

2022
nodeSelector:
2123
dedicated: user-workload

charts/topograph/values-slinky-partition-example.yaml

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,28 +4,30 @@
44

55
global:
66
# provider: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
7-
provider: aws
7+
provider:
8+
name: aws
89
# engine: "k8s" or "slinky"
9-
engine: slinky
10-
engineParams:
11-
namespace: slurm
12-
podSelector:
13-
matchLabels:
14-
app.kubernetes.io/component: compute
15-
topologies:
16-
topo1:
17-
plugin: topology/block
18-
blockSizes: [2,4]
19-
topo2:
20-
plugin: topology/block
21-
blockSizes: [8,16]
22-
topo3:
23-
plugin: topology/tree
24-
topo-default:
25-
plugin: topology/flat
26-
clusterDefault: true
27-
topologyConfigPath: topology.conf
28-
topologyConfigmapName: slurm-config
10+
engine:
11+
name: slinky
12+
params:
13+
namespace: slurm
14+
podSelector:
15+
matchLabels:
16+
app.kubernetes.io/component: compute
17+
topologies:
18+
topo1:
19+
plugin: topology/block
20+
blockSizes: [2,4]
21+
topo2:
22+
plugin: topology/block
23+
blockSizes: [8,16]
24+
topo3:
25+
plugin: topology/tree
26+
topo-default:
27+
plugin: topology/flat
28+
clusterDefault: true
29+
topologyConfigPath: topology.conf
30+
topologyConfigmapName: slurm-config
2931

3032
nodeSelector:
3133
dedicated: user-workload

charts/topograph/values-slinky-tree-example.yaml

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,19 @@
44

55
global:
66
# provider: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
7-
provider: aws
7+
provider:
8+
name: aws
89
# engine: "k8s" or "slinky"
9-
engine: slinky
10-
engineParams:
11-
namespace: slurm
12-
podSelector:
13-
matchLabels:
14-
app.kubernetes.io/component: compute
15-
plugin: topology/tree
16-
topologyConfigPath: topology.conf
17-
topologyConfigmapName: slurm-config
10+
engine:
11+
name: slinky
12+
params:
13+
namespace: slurm
14+
podSelector:
15+
matchLabels:
16+
app.kubernetes.io/component: compute
17+
plugin: topology/tree
18+
topologyConfigPath: topology.conf
19+
topologyConfigmapName: slurm-config
1820

1921
nodeSelector:
2022
dedicated: user-workload

charts/topograph/values.yaml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
# Declare variables to be passed into your templates.
44

55
global:
6-
# provider: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test".
7-
provider: test
8-
# engine: "k8s" or "slinky"
9-
engine: k8s
10-
# engineParams:
6+
provider:
7+
# name: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test".
8+
name: test
9+
engine:
10+
# name: "k8s" or "slinky"
11+
name: k8s
1112

1213
service:
1314
type: ClusterIP

internal/k8s/utils.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,12 @@ import (
1919
"k8s.io/client-go/tools/remotecommand"
2020
)
2121

22-
func GetNodes(ctx context.Context, client *kubernetes.Clientset) (*corev1.NodeList, error) {
23-
nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
22+
func GetNodes(ctx context.Context, client *kubernetes.Clientset, opt *metav1.ListOptions) (*corev1.NodeList, error) {
23+
if opt == nil {
24+
opt = &metav1.ListOptions{}
25+
}
26+
27+
nodes, err := client.CoreV1().Nodes().List(ctx, *opt)
2428
if err != nil {
2529
return nil, fmt.Errorf("failed to list node in the cluster: %v", err)
2630
}

pkg/engines/k8s/engine.go

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,12 @@ import (
2020
"context"
2121
"net/http"
2222

23+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
24+
"k8s.io/apimachinery/pkg/labels"
2325
"k8s.io/client-go/kubernetes"
2426
"k8s.io/client-go/rest"
2527

28+
"github.com/NVIDIA/topograph/internal/config"
2629
"github.com/NVIDIA/topograph/internal/httperr"
2730
"github.com/NVIDIA/topograph/pkg/engines"
2831
"github.com/NVIDIA/topograph/pkg/topology"
@@ -33,13 +36,27 @@ const NAME = "k8s"
3336
type K8sEngine struct {
3437
config *rest.Config
3538
client *kubernetes.Clientset
39+
params *Params
40+
}
41+
42+
type Params struct {
43+
// NodeSelector (optional) specifies nodes participating in the topology
44+
NodeSelector map[string]string `mapstructure:"nodeSelector"`
45+
46+
// derived fields
47+
nodeListOpt *metav1.ListOptions
3648
}
3749

3850
func NamedLoader() (string, engines.Loader) {
3951
return NAME, Loader
4052
}
4153

42-
func Loader(_ context.Context, _ engines.Config) (engines.Engine, *httperr.Error) {
54+
func Loader(_ context.Context, params engines.Config) (engines.Engine, *httperr.Error) {
55+
p, err := getParameters(params)
56+
if err != nil {
57+
return nil, httperr.NewError(http.StatusBadRequest, err.Error())
58+
}
59+
4360
config, err := rest.InClusterConfig()
4461
if err != nil {
4562
return nil, httperr.NewError(http.StatusBadGateway, err.Error())
@@ -53,9 +70,25 @@ func Loader(_ context.Context, _ engines.Config) (engines.Engine, *httperr.Error
5370
return &K8sEngine{
5471
config: config,
5572
client: client,
73+
params: p,
5674
}, nil
5775
}
5876

77+
func getParameters(params engines.Config) (*Params, error) {
78+
p := &Params{}
79+
if err := config.Decode(params, p); err != nil {
80+
return nil, err
81+
}
82+
83+
if len(p.NodeSelector) != 0 {
84+
p.nodeListOpt = &metav1.ListOptions{
85+
LabelSelector: labels.Set(p.NodeSelector).String(),
86+
}
87+
}
88+
89+
return p, nil
90+
}
91+
5992
func (eng *K8sEngine) GenerateOutput(ctx context.Context, tree *topology.Vertex, params map[string]any) ([]byte, *httperr.Error) {
6093
if err := NewTopologyLabeler().ApplyNodeLabels(ctx, tree, eng); err != nil {
6194
return nil, httperr.NewError(http.StatusBadGateway, err.Error())

pkg/engines/k8s/kubernetes.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ import (
3232
)
3333

3434
func (eng *K8sEngine) GetComputeInstances(ctx context.Context, _ engines.Environment) ([]topology.ComputeInstances, *httperr.Error) {
35-
nodes, err := k8s.GetNodes(ctx, eng.client)
35+
nodes, err := k8s.GetNodes(ctx, eng.client, eng.params.nodeListOpt)
3636
if err != nil {
3737
return nil, httperr.NewError(http.StatusBadGateway, err.Error())
3838
}

0 commit comments

Comments
 (0)