Skip to content

Commit 8680423

Browse files
authored
add DRA provider (#159)
Signed-off-by: Dmitry Shmulevich <[email protected]>
1 parent 62db786 commit 8680423

File tree

8 files changed

+120
-17
lines changed

8 files changed

+120
-17
lines changed

Dockerfile

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
# syntax=docker/dockerfile:1
2-
3-
FROM --platform=${TARGETOS}/${TARGETARCH} golang:1.23.3 AS builder
1+
FROM golang:1.23.3 AS builder
42

53
WORKDIR /go/src/github.com/NVIDIA/topograph
64
COPY . .
@@ -10,6 +8,6 @@ ARG TARGETARCH
108

119
RUN make build-${TARGETOS}-${TARGETARCH}
1210

13-
FROM --platform=${TARGETOS}/${TARGETARCH} gcr.io/distroless/static-debian11:nonroot
11+
FROM gcr.io/distroless/static-debian11:nonroot
1412

1513
COPY --from=builder /go/src/github.com/NVIDIA/topograph/bin/* /usr/local/bin/

Makefile

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ LINTER_BIN ?= golangci-lint
1616
DOCKER_BIN ?= docker
1717
GOOS ?= $(shell uname | tr '[:upper:]' '[:lower:]')
1818
GOARCH ?= $(shell arch | sed 's/x86_64/amd64/')
19+
PLATFORMS ?= linux/arm64,linux/amd64
1920
TARGETS := topograph node-observer node-data-broker-initc
2021
CMD_DIR := ./cmd
2122
OUTPUT_DIR := ./bin
@@ -89,12 +90,12 @@ image-build:
8990
image-push: image-build
9091
$(DOCKER_BIN) push $(IMAGE_REPO):$(IMAGE_TAG)
9192

92-
.PHONY: docker
93-
docker:
94-
docker buildx create --use --name=crossplat --node=crossplat || true
95-
docker buildx build \
96-
--platform linux/amd64,linux/arm64 \
97-
-t $(IMAGE_REPO):$(IMAGE_TAG) -f ./Dockerfile --push .
93+
.PHONY: docker-buildx
94+
docker-buildx:
95+
- $(DOCKER_BIN) buildx create --name=topograph-builder
96+
$(DOCKER_BIN) buildx use topograph-builder
97+
$(DOCKER_BIN) buildx build --platform $(PLATFORMS) -t $(IMAGE_REPO):$(IMAGE_TAG) -f ./Dockerfile --push .
98+
- $(DOCKER_BIN) buildx rm topograph-builder
9899

99100
.PHONY: ssl
100101
ssl:

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ http:
6161
ssl: false
6262

6363
# provider: the provider that topograph will use (optional)
64-
# Valid options include "aws", "oci", "gcp", "nebius", "netq", "baremetal" or "test".
64+
# Valid options include "aws", "oci", "gcp", "nebius", "netq", "dra", "baremetal" or "test".
6565
# Can be overridden if the provider is specified in a topology request to topograph
6666
provider: test
6767

@@ -110,6 +110,8 @@ Currently supported providers:
110110
- OCI
111111
- GCP
112112
- Nebius
113+
- NetQ
114+
- DRA
113115
- Bare metal
114116

115117
Currently supported engines:
@@ -133,7 +135,7 @@ Topograph offers three endpoints for interacting with the service. Below are the
133135
- **Description:** This endpoint is used to request a new cluster topology.
134136
- **Payload:** The payload is a JSON object that includes the following fields:
135137

136-
- **provider name**: (optional) A string specifying the Service Provider, such as `aws`, `oci`, `gcp`, `nebius`, `netq`, `baremetal` or `test`. This parameter will be override the provider set in the topograph config.
138+
- **provider name**: (optional) A string specifying the Service Provider, such as `aws`, `oci`, `gcp`, `nebius`, `netq`, `dra`, `baremetal` or `test`. This parameter will override the provider set in the topograph config.
137139
- **provider credentials**: (optional) A key-value map with provider-specific parameters for authentication.
138140
- **provider parameters**: (optional) A key-value map with parameters that are used for provider simulation with toposim.
139141
- **model_path**: (optional) A string parameter that points to the model file to use for simulating topology.

charts/topograph/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Declare variables to be passed into your templates.
44

55
global:
6-
# provider: "aws", "oci", "gcp", "nebius", "netq", "baremetal" or "test".
6+
# provider: "aws", "oci", "gcp", "nebius", "netq", "baremetal", "dra" or "test".
77
provider: test
88
# engine: "k8s" or "slinky"
99
engine: k8s

cmd/node-data-broker-initc/main.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"k8s.io/klog/v2"
3131

3232
"github.com/NVIDIA/topograph/pkg/providers/aws"
33+
"github.com/NVIDIA/topograph/pkg/providers/dra"
3334
"github.com/NVIDIA/topograph/pkg/providers/gcp"
3435
"github.com/NVIDIA/topograph/pkg/providers/nebius"
3536
"github.com/NVIDIA/topograph/pkg/providers/oci"
@@ -58,7 +59,7 @@ func main() {
5859
}
5960
}
6061

61-
func mainInternal(provider string) (err error) {
62+
func mainInternal(provider string) error {
6263
ctx := context.TODO()
6364
nodeName := os.Getenv("NODE_NAME")
6465

@@ -72,7 +73,7 @@ func mainInternal(provider string) (err error) {
7273
return fmt.Errorf("failed to create clientset: %v", err)
7374
}
7475

75-
annotations, err := getAnnotations(ctx, provider)
76+
annotations, err := getAnnotations(ctx, provider, nodeName)
7677
if err != nil {
7778
return err
7879
}
@@ -93,7 +94,7 @@ func mainInternal(provider string) (err error) {
9394
return nil
9495
}
9596

96-
func getAnnotations(ctx context.Context, provider string) (map[string]string, error) {
97+
func getAnnotations(ctx context.Context, provider, nodeName string) (map[string]string, error) {
9798
switch provider {
9899
case aws.NAME:
99100
return aws.GetNodeAnnotations(ctx)
@@ -103,6 +104,8 @@ func getAnnotations(ctx context.Context, provider string) (map[string]string, er
103104
return oci.GetNodeAnnotations(ctx)
104105
case nebius.NAME:
105106
return nebius.GetNodeAnnotations(ctx)
107+
case dra.NAME:
108+
return dra.GetNodeAnnotations(ctx, nodeName)
106109
case "":
107110
return nil, fmt.Errorf("must set provider")
108111
default:

cmd/node-data-broker-initc/main_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ func TestGetAnnotations(t *testing.T) {
4444

4545
for _, tc := range testCases {
4646
t.Run(tc.name, func(t *testing.T) {
47-
_, err := getAnnotations(context.TODO(), tc.provider)
47+
_, err := getAnnotations(context.TODO(), tc.provider, "")
4848
require.EqualError(t, err, tc.err)
4949
})
5050
}

pkg/providers/dra/provider.go

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*
2+
* Copyright 2025 NVIDIA CORPORATION
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package dra
7+
8+
import (
9+
"context"
10+
11+
"k8s.io/client-go/kubernetes"
12+
"k8s.io/client-go/rest"
13+
14+
"github.com/NVIDIA/topograph/internal/k8s"
15+
"github.com/NVIDIA/topograph/pkg/providers"
16+
"github.com/NVIDIA/topograph/pkg/topology"
17+
)
18+
19+
// NAME is the provider identifier for DRA; NamedLoader returns it alongside Loader.
const NAME = "dra"
20+
21+
// Provider is the DRA topology provider. It keeps the Kubernetes REST
// configuration and the clientset that Loader builds from it.
type Provider struct {
22+
config *rest.Config // REST config obtained in Loader
23+
client *kubernetes.Clientset // clientset used to list cluster nodes
24+
}
25+
26+
// NamedLoader returns the provider name (NAME) paired with its Loader function.
func NamedLoader() (string, providers.Loader) {
27+
return NAME, Loader
28+
}
29+
30+
// Loader constructs a DRA Provider backed by a Kubernetes clientset.
//
// The clientset is created from rest.InClusterConfig. Neither ctx nor config
// is read by this function. Any error from building the REST config or the
// clientset is returned unchanged, with a nil provider.
func Loader(ctx context.Context, config providers.Config) (providers.Provider, error) {
31+
cfg, err := rest.InClusterConfig()
32+
if err != nil {
33+
return nil, err
34+
}
35+
36+
client, err := kubernetes.NewForConfig(cfg)
37+
if err != nil {
38+
return nil, err
39+
}
40+
41+
return &Provider{
42+
config: cfg,
43+
client: client,
44+
}, nil
45+
}
46+
47+
// GenerateTopologyConfig builds a topology graph from cluster node metadata.
//
// It lists nodes with k8s.GetNodes and keeps only those that carry the
// "nvidia.com/gpu.clique" label, whose topology.KeyNodeRegion annotation
// matches the Region of one of the given instances, and whose name is a key
// in that entry's Instances map. Each such node is added to a domain map
// under its clique label value, and the domain map is converted with toGraph.
//
// The second parameter is ignored. An error from k8s.GetNodes is returned
// unchanged, with a nil vertex.
func (p *Provider) GenerateTopologyConfig(ctx context.Context, _ *int, instances []topology.ComputeInstances) (*topology.Vertex, error) {
48+
regIndices := make(map[string]int) // map[region : index]
49+
for i, ci := range instances {
50+
regIndices[ci.Region] = i // a region that appears more than once keeps the last index
51+
}
52+
53+
nodes, err := k8s.GetNodes(ctx, p.client)
54+
if err != nil {
55+
return nil, err
56+
}
57+
58+
domainMap := topology.NewDomainMap()
59+
for _, node := range nodes.Items {
60+
clusterID, ok := node.Labels["nvidia.com/gpu.clique"]
61+
if !ok {
62+
continue // node has no clique label; skip it
63+
}
64+
65+
region := node.Annotations[topology.KeyNodeRegion] // empty string when the annotation is absent
66+
indx, ok := regIndices[region]
67+
if !ok {
68+
continue // node's region is not among the requested instances
69+
}
70+
71+
i2n := instances[indx].Instances // keyed here by node name; value semantics presumably instance -> host name (TODO confirm)
72+
if host, ok := i2n[node.Name]; ok {
73+
domainMap.AddHost(clusterID, node.Name, host)
74+
}
75+
}
76+
77+
return toGraph(domainMap), nil
78+
}
79+
80+
// toGraph wraps the given domain map in a root vertex. The root starts with
// empty Vertices and Metadata maps; the result of domainMap.ToBlocks() is
// stored in Vertices under the topology.TopologyBlock key.
func toGraph(domainMap topology.DomainMap) *topology.Vertex {
81+
root := &topology.Vertex{
82+
Vertices: make(map[string]*topology.Vertex),
83+
Metadata: make(map[string]string),
84+
}
85+
root.Vertices[topology.TopologyBlock] = domainMap.ToBlocks()
86+
87+
return root
88+
}
89+
90+
// GetNodeAnnotations returns the annotations for a node under the DRA provider:
// topology.KeyNodeInstance is set to hostname and topology.KeyNodeRegion to
// the fixed value "local". ctx is not used and the returned error is always nil.
func GetNodeAnnotations(ctx context.Context, hostname string) (map[string]string, error) {
91+
annotations := map[string]string{
92+
topology.KeyNodeInstance: hostname,
93+
topology.KeyNodeRegion: "local",
94+
}
95+
96+
return annotations, nil
97+
}

pkg/registry/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/NVIDIA/topograph/pkg/providers/aws"
2727
"github.com/NVIDIA/topograph/pkg/providers/baremetal"
2828
"github.com/NVIDIA/topograph/pkg/providers/cw"
29+
"github.com/NVIDIA/topograph/pkg/providers/dra"
2930
"github.com/NVIDIA/topograph/pkg/providers/gcp"
3031
"github.com/NVIDIA/topograph/pkg/providers/nebius"
3132
"github.com/NVIDIA/topograph/pkg/providers/netq"
@@ -38,6 +39,7 @@ var Providers = providers.NewRegistry(
3839
aws.NamedLoaderSim,
3940
baremetal.NamedLoader,
4041
cw.NamedLoader,
42+
dra.NamedLoader,
4143
gcp.NamedLoader,
4244
gcp.NamedLoaderSim,
4345
oci.NamedLoaderAPI,

0 commit comments

Comments
 (0)