Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions charts/topograph/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.2.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand All @@ -25,8 +25,8 @@ appVersion: "1.16.0"

dependencies:
- name: node-data-broker
version: 0.1.0
version: 0.2.0
repository: "file://charts/node-data-broker"
- name: node-observer
version: 0.1.0
version: 0.2.0
repository: "file://charts/node-observer"
2 changes: 1 addition & 1 deletion charts/topograph/charts/node-data-broker/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.2.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ spec:
command:
- /usr/local/bin/node-data-broker-initc
args:
- -provider={{ .Values.global.provider }}
- -provider={{ .Values.global.provider.name }}
- -v={{ .Values.verbosity }}
env:
- name: NODE_NAME
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rules:
- apiGroups: [""]
resources: [nodes]
verbs: [get,list,update]
{{- if eq .Values.global.provider "infiniband-k8s" }}
{{- if eq .Values.global.provider.name "infiniband-k8s" }}
- apiGroups: [apps]
resources: [daemonsets]
verbs: [get,list]
Expand Down
2 changes: 1 addition & 1 deletion charts/topograph/charts/node-observer/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.2.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ metadata:
data:
node-observer-config.yaml: |-
generateTopologyUrl: "{{ include "topograph.url" $ }}/v1/generate"
params:
{{- toYaml .Values.global.engineParams | nindent 6 }}
provider:
{{- toYaml .Values.global.provider | nindent 6 }}
engine:
{{- toYaml .Values.global.engine | nindent 6 }}
trigger:
{{- toYaml .Values.topograph.trigger | nindent 6 }}
2 changes: 0 additions & 2 deletions charts/topograph/templates/configmap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ data:
http:
port: {{ .Values.global.service.port }}
ssl: false
provider: {{ .Values.global.provider }}
engine: {{ .Values.global.engine }}
requestAggregationDelay: {{ .Values.config.requestAggregationDelay }}
{{- if .Values.config.credentialsSecretName }}
credentialsPath: /etc/topograph/credentials/credentials.yaml
Expand Down
2 changes: 1 addition & 1 deletion charts/topograph/templates/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ rules:
- apiGroups: [apps]
resources: [daemonsets]
verbs: [get,list]
{{- if eq .Values.global.engine "slinky" }}
{{- if eq .Values.global.engine.name "slinky" }}
- apiGroups: [""]
resources: [configmaps]
verbs: [create,get,list,update]
Expand Down
32 changes: 19 additions & 13 deletions charts/topograph/values-slinky-block-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,25 @@
# Declare variables to be passed into your templates.

global:
# provider: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
provider: aws
# engine: "k8s" or "slinky"
engine: slinky
engineParams:
namespace: slurm
podSelector:
matchLabels:
app.kubernetes.io/component: compute
plugin: topology/block
block_sizes: 4
topologyConfigPath: topology.conf
topologyConfigmapName: slurm-config
provider:
# name: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
name: aws
params:
nodeSelector:
slurmCluster: my-cluster
engine:
name: slinky
params:
namespace: slurm
nodeSelector:
slurmCluster: my-cluster
podSelector:
matchLabels:
app.kubernetes.io/component: compute
plugin: topology/block
block_sizes: 4
topologyConfigPath: topology.conf
topologyConfigmapName: slurm-config

nodeSelector:
dedicated: user-workload
Expand Down
47 changes: 24 additions & 23 deletions charts/topograph/values-slinky-partition-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,30 @@
# Declare variables to be passed into your templates.

global:
# provider: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
provider: aws
# engine: "k8s" or "slinky"
engine: slinky
engineParams:
namespace: slurm
podSelector:
matchLabels:
app.kubernetes.io/component: compute
topologies:
topo1:
plugin: topology/block
blockSizes: [2,4]
topo2:
plugin: topology/block
blockSizes: [8,16]
topo3:
plugin: topology/tree
topo-default:
plugin: topology/flat
clusterDefault: true
topologyConfigPath: topology.conf
topologyConfigmapName: slurm-config
provider:
# name: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
name: aws
engine:
name: slinky
params:
namespace: slurm
podSelector:
matchLabels:
app.kubernetes.io/component: compute
topologies:
topo1:
plugin: topology/block
blockSizes: [2,4]
topo2:
plugin: topology/block
blockSizes: [8,16]
topo3:
plugin: topology/tree
topo-default:
plugin: topology/flat
clusterDefault: true
topologyConfigPath: topology.conf
topologyConfigmapName: slurm-config

nodeSelector:
dedicated: user-workload
Expand Down
25 changes: 13 additions & 12 deletions charts/topograph/values-slinky-tree-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@
# Declare variables to be passed into your templates.

global:
# provider: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
provider: aws
# engine: "k8s" or "slinky"
engine: slinky
engineParams:
namespace: slurm
podSelector:
matchLabels:
app.kubernetes.io/component: compute
plugin: topology/tree
topologyConfigPath: topology.conf
topologyConfigmapName: slurm-config
provider:
# name: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
name: aws
engine:
name: slinky
params:
namespace: slurm
podSelector:
matchLabels:
app.kubernetes.io/component: compute
plugin: topology/tree
topologyConfigPath: topology.conf
topologyConfigmapName: slurm-config

nodeSelector:
dedicated: user-workload
Expand Down
11 changes: 6 additions & 5 deletions charts/topograph/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
# Declare variables to be passed into your templates.

global:
# provider: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test".
provider: test
# engine: "k8s" or "slinky"
engine: k8s
# engineParams:
provider:
# name: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test".
name: test
engine:
# name: "k8s" or "slinky"
name: k8s

service:
type: ClusterIP
Expand Down
8 changes: 6 additions & 2 deletions internal/k8s/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ import (
"k8s.io/client-go/tools/remotecommand"
)

func GetNodes(ctx context.Context, client *kubernetes.Clientset) (*corev1.NodeList, error) {
nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
func GetNodes(ctx context.Context, client *kubernetes.Clientset, opt *metav1.ListOptions) (*corev1.NodeList, error) {
if opt == nil {
opt = &metav1.ListOptions{}
}

nodes, err := client.CoreV1().Nodes().List(ctx, *opt)
if err != nil {
return nil, fmt.Errorf("failed to list node in the cluster: %v", err)
}
Expand Down
35 changes: 34 additions & 1 deletion pkg/engines/k8s/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ import (
"context"
"net/http"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"

"github.com/NVIDIA/topograph/internal/config"
"github.com/NVIDIA/topograph/internal/httperr"
"github.com/NVIDIA/topograph/pkg/engines"
"github.com/NVIDIA/topograph/pkg/topology"
Expand All @@ -33,13 +36,27 @@ const NAME = "k8s"
// K8sEngine is the engine implementation returned by Loader for the "k8s"
// engine name.
type K8sEngine struct {
// config is the REST configuration obtained from rest.InClusterConfig in Loader.
config *rest.Config
// client is the Kubernetes clientset used to query the cluster (e.g. listing nodes).
client *kubernetes.Clientset
// params holds the decoded engine parameters produced by getParameters.
params *Params
}

// Params holds the k8s engine parameters decoded from the engine configuration.
type Params struct {
// NodeSelector (optional) specifies nodes participating in the topology
NodeSelector map[string]string `mapstructure:"nodeSelector"`

// derived fields

// nodeListOpt carries the label selector built from NodeSelector by
// getParameters; it stays nil when NodeSelector is empty.
nodeListOpt *metav1.ListOptions
}

// NamedLoader returns the engine name (NAME) together with the Loader function
// that constructs the engine.
// NOTE(review): presumably consumed by an engine registry — confirm against caller.
func NamedLoader() (string, engines.Loader) {
return NAME, Loader
}

func Loader(_ context.Context, _ engines.Config) (engines.Engine, *httperr.Error) {
func Loader(_ context.Context, params engines.Config) (engines.Engine, *httperr.Error) {
p, err := getParameters(params)
if err != nil {
return nil, httperr.NewError(http.StatusBadRequest, err.Error())
}

config, err := rest.InClusterConfig()
if err != nil {
return nil, httperr.NewError(http.StatusBadGateway, err.Error())
Expand All @@ -53,9 +70,25 @@ func Loader(_ context.Context, _ engines.Config) (engines.Engine, *httperr.Error
return &K8sEngine{
config: config,
client: client,
params: p,
}, nil
}

// getParameters decodes the engine configuration into a Params value.
//
// When the decoded NodeSelector is non-empty, the matching node list options
// are precomputed and stored in the derived nodeListOpt field; otherwise that
// field is left nil. A decoding failure is returned unchanged, with a nil
// result.
func getParameters(params engines.Config) (*Params, error) {
	var decoded Params
	if err := config.Decode(params, &decoded); err != nil {
		return nil, err
	}

	// Nothing to derive when no selector was supplied.
	if len(decoded.NodeSelector) == 0 {
		return &decoded, nil
	}

	selector := labels.Set(decoded.NodeSelector).String()
	decoded.nodeListOpt = &metav1.ListOptions{LabelSelector: selector}

	return &decoded, nil
}

func (eng *K8sEngine) GenerateOutput(ctx context.Context, tree *topology.Vertex, params map[string]any) ([]byte, *httperr.Error) {
if err := NewTopologyLabeler().ApplyNodeLabels(ctx, tree, eng); err != nil {
return nil, httperr.NewError(http.StatusBadGateway, err.Error())
Expand Down
55 changes: 55 additions & 0 deletions pkg/engines/k8s/engine_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright 2025 NVIDIA CORPORATION
* SPDX-License-Identifier: Apache-2.0
*/

package k8s

import (
"testing"

"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// TestGetParameters checks getParameters against missing, malformed, and
// valid engine parameters.
func TestGetParameters(t *testing.T) {
	type scenario struct {
		name   string
		params map[string]any
		ret    *Params
		err    string
	}

	scenarios := []scenario{
		{
			name:   "Case 1: no params",
			params: nil,
			ret:    &Params{},
		},
		{
			name:   "Case 2: bad params",
			params: map[string]any{"nodeSelector": .1},
			err:    "could not decode configuration: 1 error(s) decoding:\n\n* 'nodeSelector' expected a map, got 'float64'",
		},
		{
			name:   "Case 3: valid input",
			params: map[string]any{"nodeSelector": map[string]string{"key": "val"}},
			ret: &Params{
				NodeSelector: map[string]string{"key": "val"},
				nodeListOpt: &metav1.ListOptions{
					LabelSelector: "key=val",
				},
			},
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			got, err := getParameters(sc.params)

			// Error scenarios only assert on the message.
			if sc.err != "" {
				require.ErrorContains(t, err, sc.err)
				return
			}

			require.NoError(t, err)
			require.Equal(t, sc.ret, got)
		})
	}
}
2 changes: 1 addition & 1 deletion pkg/engines/k8s/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ import (
)

func (eng *K8sEngine) GetComputeInstances(ctx context.Context, _ engines.Environment) ([]topology.ComputeInstances, *httperr.Error) {
nodes, err := k8s.GetNodes(ctx, eng.client)
nodes, err := k8s.GetNodes(ctx, eng.client, eng.params.nodeListOpt)
if err != nil {
return nil, httperr.NewError(http.StatusBadGateway, err.Error())
}
Expand Down
Loading