Skip to content

Commit 99e62d4

Browse files
committed
feat: add support for NVLink domain discovery in NetQ provider
Signed-off-by: Dmitry Shmulevich <[email protected]>
1 parent 0663765 commit 99e62d4

File tree

6 files changed

+210
-10
lines changed

6 files changed

+210
-10
lines changed

pkg/providers/netq/netq.go

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import (
2323
// Request paths used by the NetQ provider. The login and opid paths sit under
// auth/v1; the topology path is the fetch-topology endpoint of the
// topologygraph telemetry object.
const (
2424
LoginURL = "auth/v1/login"
2525
OpIdURL = "auth/v1/select/opid"
26-
TopologyURL = "telemetry/v1/object/topologygraph/fetch-topology"
26+
// This commit prefixes the topology path with "api/netq/"; the two auth
// paths above are left unchanged.
TopologyURL = "api/netq/telemetry/v1/object/topologygraph/fetch-topology"
2727
)
2828

2929
type NetqResponse struct {
@@ -49,7 +49,7 @@ type AuthOutput struct {
4949
AccessToken string `json:"access_token"`
5050
}
5151

52-
func (p *Provider) generateTopologyConfig(ctx context.Context, cis []topology.ComputeInstances) (*topology.Vertex, *httperr.Error) {
52+
func (p *Provider) getNetworkTree(ctx context.Context, cis []topology.ComputeInstances) (*topology.Vertex, *httperr.Error) {
5353
// 1. login to NetQ server
5454
payload := strings.NewReader(fmt.Sprintf(`{"username":%q, "password":%q}`, p.cred.user, p.cred.passwd))
5555
headers := map[string]string{
@@ -242,9 +242,5 @@ func parseNetq(resp []NetqResponse, inputNodes map[string]bool) (*topology.Verte
242242
treeRoot.Vertices[node.ID] = node
243243
}
244244

245-
root := &topology.Vertex{
246-
Vertices: map[string]*topology.Vertex{topology.TopologyTree: treeRoot},
247-
}
248-
249-
return root, nil
245+
return treeRoot, nil
250246
}

pkg/providers/netq/netq_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,11 @@ func TestParseNetq(t *testing.T) {
104104
Nodes: nodes,
105105
}}
106106

107-
root, err := parseNetq(netqResponse, map[string]bool{"A": true})
107+
treeRoot, err := parseNetq(netqResponse, map[string]bool{"A": true})
108108
require.Nil(t, err)
109109

110110
top := []*topology.Vertex{}
111-
for _, v := range root.Vertices[topology.TopologyTree].Vertices {
111+
for _, v := range treeRoot.Vertices {
112112
top = append(top, v)
113113
}
114114

pkg/providers/netq/nmx.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Copyright 2025 NVIDIA CORPORATION
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package netq
7+
8+
import (
9+
"context"
10+
"encoding/base64"
11+
"encoding/json"
12+
"fmt"
13+
"net/http"
14+
15+
"k8s.io/klog/v2"
16+
17+
"github.com/NVIDIA/topograph/internal/httperr"
18+
"github.com/NVIDIA/topograph/internal/httpreq"
19+
"github.com/NVIDIA/topograph/pkg/topology"
20+
)
21+
22+
// ComputeURL is the path, appended to the provider's API URL, of the NMX
// endpoint whose response is decoded as a list of compute nodes.
const ComputeURL = "nmx/v1/compute-nodes"
25+
26+
// ComputeNode holds the fields of one compute-node record that this provider
// decodes from the NMX response. JSON fields without a matching struct field
// are ignored by encoding/json.
type ComputeNode struct {
	// Id is read from the "ID" JSON field.
	Id string `json:"ID"`

	// Name is read from the "Name" JSON field.
	Name string `json:"Name"`

	// DomainUUID is read from the "DomainUUID" JSON field.
	DomainUUID string `json:"DomainUUID"`
}
31+
32+
func (p *Provider) getNvlDomains(ctx context.Context) (topology.DomainMap, *httperr.Error) {
33+
url, headers, httpErr := p.getComputeUrl()
34+
if httpErr != nil {
35+
return nil, httpErr
36+
}
37+
38+
klog.V(4).Infof("Fetching %s", url)
39+
f := getRequestFunc(ctx, "GET", url, headers, nil)
40+
_, data, err := httpreq.DoRequest(f, true)
41+
if err != nil {
42+
return nil, err
43+
}
44+
45+
return parseComputeNodes(data)
46+
}
47+
48+
func (p *Provider) getComputeUrl() (string, map[string]string, *httperr.Error) {
49+
auth := p.cred.user + ":" + p.cred.passwd
50+
authHeader := "Basic " + base64.StdEncoding.EncodeToString([]byte(auth))
51+
headers := map[string]string{"Authorization": authHeader}
52+
53+
url, err := httpreq.GetURL(p.params.ApiURL, nil, ComputeURL)
54+
return url, headers, err
55+
}
56+
57+
func parseComputeNodes(data []byte) (topology.DomainMap, *httperr.Error) {
58+
var computeNodes []ComputeNode
59+
err := json.Unmarshal(data, &computeNodes)
60+
if err != nil {
61+
return nil, httperr.NewError(http.StatusBadGateway, fmt.Sprintf("nmx output read failed: %v", err))
62+
}
63+
64+
domainMap := topology.NewDomainMap()
65+
for _, node := range computeNodes {
66+
klog.V(4).Infof("Add NVL domain %q for node %q", node.DomainUUID, node.Name)
67+
domainMap.AddHost(node.DomainUUID, node.Name, node.Name)
68+
}
69+
70+
return domainMap, nil
71+
}

pkg/providers/netq/nmx_test.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* Copyright 2025 NVIDIA CORPORATION
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package netq
7+
8+
import (
9+
"os"
10+
"testing"
11+
12+
"github.com/NVIDIA/topograph/pkg/topology"
13+
"github.com/stretchr/testify/require"
14+
)
15+
16+
func TestGetComputeUrl(t *testing.T) {
17+
p := &Provider{
18+
params: &ProviderParams{},
19+
cred: &Credentials{user: "user", passwd: "passwd"},
20+
}
21+
22+
testCases := []struct {
23+
name string
24+
serverURL string
25+
headers map[string]string
26+
computeUrl string
27+
err string
28+
}{
29+
{
30+
name: "Case 1: invalid URL",
31+
serverURL: `:///server`,
32+
err: `parse ":///server": missing protocol scheme`,
33+
},
34+
{
35+
name: "Case 2: valid input",
36+
serverURL: `https://server.com`,
37+
headers: map[string]string{"Authorization": "Basic dXNlcjpwYXNzd2Q="},
38+
computeUrl: `https://server.com/nmx/v1/compute-nodes`,
39+
},
40+
}
41+
42+
for _, tc := range testCases {
43+
t.Run(tc.name, func(t *testing.T) {
44+
p.params.ApiURL = tc.serverURL
45+
url, headers, err := p.getComputeUrl()
46+
if len(tc.err) != 0 {
47+
require.NotNil(t, err)
48+
require.EqualError(t, err, tc.err)
49+
} else {
50+
require.Nil(t, err)
51+
require.Equal(t, tc.computeUrl, url)
52+
require.Equal(t, tc.headers, headers)
53+
}
54+
})
55+
}
56+
}
57+
58+
func TestParseComputeNodes(t *testing.T) {
59+
data, err := os.ReadFile("../../../tests/output/netq/computeNodes.json")
60+
require.NoError(t, err)
61+
62+
domains, err := parseComputeNodes(data)
63+
require.Nil(t, err)
64+
65+
expected := topology.NewDomainMap()
66+
expected.AddHost("3fbfc98b-7f95-4749-ab11-bd351a2aab3e", "node1", "node1")
67+
expected.AddHost("3fbfc98b-7f95-4749-ab11-bd351a2aab3e", "node2", "node2")
68+
69+
require.Equal(t, expected, domains)
70+
}

pkg/providers/netq/provider.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,24 @@ func getParams(params map[string]any) (*ProviderParams, error) {
8585
}
8686

8787
// GenerateTopologyConfig builds the topology root for the given instances.
//
// As of this commit it first fetches the NVLink domains (getNvlDomains) and
// then the network tree (getNetworkTree), returning the first error
// encountered. On success the returned root vertex holds the network tree
// under topology.TopologyTree and the domains, converted with ToBlocks, under
// topology.TopologyBlock. The unnamed *int parameter is ignored.
func (p *Provider) GenerateTopologyConfig(ctx context.Context, _ *int, instances []topology.ComputeInstances) (*topology.Vertex, *httperr.Error) {
88-
return p.generateTopologyConfig(ctx, instances)
88+
domains, err := p.getNvlDomains(ctx)
89+
if err != nil {
90+
return nil, err
91+
}
92+
93+
treeRoot, err := p.getNetworkTree(ctx, instances)
94+
if err != nil {
95+
return nil, err
96+
}
97+
98+
// Wrapping the tree in a root vertex was previously done at the end of
// parseNetq; it now happens here so the block topology can sit beside it.
root := &topology.Vertex{
99+
Vertices: map[string]*topology.Vertex{
100+
topology.TopologyTree: treeRoot,
101+
topology.TopologyBlock: domains.ToBlocks(),
102+
},
103+
}
104+
105+
return root, nil
89106
}
90107

91108
// Instances2NodeMap implements slurm.instanceMapper
tests/output/netq/computeNodes.json

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
[
2+
{
3+
"CreatedAt": "2025-11-04T16:45:16.482Z",
4+
"Description": "Test",
5+
"DomainUUID": "3fbfc98b-7f95-4749-ab11-bd351a2aab3e",
6+
"GpuIDList": [
7+
"690a2d9cc5e905fa41a87cdc",
8+
"690a2d9cc5e905fa41a87cdd",
9+
"690a2d9cc5e905fa41a87cde",
10+
"690a2d9cc5e905fa41a87cdf"
11+
],
12+
"Health": "HEALTHY",
13+
"ID": "690a2d9cc5e905fa41a87d79",
14+
"LocationInfo": {
15+
"ChassisID": 1,
16+
"ChassisSerialNumber": "1825024170029",
17+
"HostID": 1,
18+
"SlotID": 5,
19+
"TrayIndex": 4
20+
},
21+
"Name": "node1",
22+
"UpdatedAt": "2025-11-05T18:37:47.155Z"
23+
},
24+
{
25+
"CreatedAt": "2025-11-04T16:45:16.482Z",
26+
"Description": "",
27+
"DomainUUID": "3fbfc98b-7f95-4749-ab11-bd351a2aab3e",
28+
"GpuIDList": [
29+
"690a2d9cc5e905fa41a87ce0",
30+
"690a2d9cc5e905fa41a87ce2",
31+
"690a2d9cc5e905fa41a87ce3",
32+
"690a2d9cc5e905fa41a87ce1"
33+
],
34+
"Health": "HEALTHY",
35+
"ID": "690a2d9cc5e905fa41a87d7a",
36+
"LocationInfo": {
37+
"ChassisID": 1,
38+
"ChassisSerialNumber": "1825024170029",
39+
"HostID": 1,
40+
"SlotID": 22,
41+
"TrayIndex": 12
42+
},
43+
"Name": "node2",
44+
"UpdatedAt": "2025-11-05T18:37:47.155Z"
45+
}
46+
]

0 commit comments

Comments
 (0)