diff --git a/pkg/providers/netq/netq.go b/pkg/providers/netq/netq.go index 7c040c4..9baddb4 100644 --- a/pkg/providers/netq/netq.go +++ b/pkg/providers/netq/netq.go @@ -23,7 +23,7 @@ import ( const ( LoginURL = "auth/v1/login" OpIdURL = "auth/v1/select/opid" - TopologyURL = "telemetry/v1/object/topologygraph/fetch-topology" + TopologyURL = "api/netq/telemetry/v1/object/topologygraph/fetch-topology" ) type NetqResponse struct { @@ -49,7 +49,7 @@ type AuthOutput struct { AccessToken string `json:"access_token"` } -func (p *Provider) generateTopologyConfig(ctx context.Context, cis []topology.ComputeInstances) (*topology.Vertex, *httperr.Error) { +func (p *Provider) getNetworkTree(ctx context.Context, cis []topology.ComputeInstances) (*topology.Vertex, *httperr.Error) { // 1. login to NetQ server payload := strings.NewReader(fmt.Sprintf(`{"username":%q, "password":%q}`, p.cred.user, p.cred.passwd)) headers := map[string]string{ @@ -242,9 +242,5 @@ func parseNetq(resp []NetqResponse, inputNodes map[string]bool) (*topology.Verte treeRoot.Vertices[node.ID] = node } - root := &topology.Vertex{ - Vertices: map[string]*topology.Vertex{topology.TopologyTree: treeRoot}, - } - - return root, nil + return treeRoot, nil } diff --git a/pkg/providers/netq/netq_test.go b/pkg/providers/netq/netq_test.go index c733575..42c0359 100644 --- a/pkg/providers/netq/netq_test.go +++ b/pkg/providers/netq/netq_test.go @@ -104,11 +104,11 @@ func TestParseNetq(t *testing.T) { Nodes: nodes, }} - root, err := parseNetq(netqResponse, map[string]bool{"A": true}) + treeRoot, err := parseNetq(netqResponse, map[string]bool{"A": true}) require.Nil(t, err) top := []*topology.Vertex{} - for _, v := range root.Vertices[topology.TopologyTree].Vertices { + for _, v := range treeRoot.Vertices { top = append(top, v) } diff --git a/pkg/providers/netq/nmx.go b/pkg/providers/netq/nmx.go new file mode 100644 index 0000000..e28b0b3 --- /dev/null +++ b/pkg/providers/netq/nmx.go @@ -0,0 +1,71 @@ +/* + * Copyright 2025 NVIDIA CORPORATION + * SPDX-License-Identifier: Apache-2.0 + */ + +package netq + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + "net/http" + + "k8s.io/klog/v2" + + "github.com/NVIDIA/topograph/internal/httperr" + "github.com/NVIDIA/topograph/internal/httpreq" + "github.com/NVIDIA/topograph/pkg/topology" +) + +const ( + ComputeURL = "nmx/v1/compute-nodes" +) + +type ComputeNode struct { + Id string `json:"ID"` + Name string `json:"Name"` + DomainUUID string `json:"DomainUUID"` +} + +func (p *Provider) getNvlDomains(ctx context.Context) (topology.DomainMap, *httperr.Error) { + url, headers, httpErr := p.getComputeUrl() + if httpErr != nil { + return nil, httpErr + } + + klog.V(4).Infof("Fetching %s", url) + f := getRequestFunc(ctx, "GET", url, headers, nil) + _, data, err := httpreq.DoRequest(f, true) + if err != nil { + return nil, err + } + + return parseComputeNodes(data) +} + +func (p *Provider) getComputeUrl() (string, map[string]string, *httperr.Error) { + auth := p.cred.user + ":" + p.cred.passwd + authHeader := "Basic " + base64.StdEncoding.EncodeToString([]byte(auth)) + headers := map[string]string{"Authorization": authHeader} + + url, err := httpreq.GetURL(p.params.ApiURL, nil, ComputeURL) + return url, headers, err +} + +func parseComputeNodes(data []byte) (topology.DomainMap, *httperr.Error) { + var computeNodes []ComputeNode + err := json.Unmarshal(data, &computeNodes) + if err != nil { + return nil, httperr.NewError(http.StatusBadGateway, fmt.Sprintf("nmx output read failed: %v", err)) + } + + domainMap := topology.NewDomainMap() + for _, node := range computeNodes { + klog.V(4).Infof("Add NVL domain %q for node %q", node.DomainUUID, node.Name) + domainMap.AddHost(node.DomainUUID, node.Name, node.Name) + } + + return domainMap, nil +} diff --git a/pkg/providers/netq/nmx_test.go b/pkg/providers/netq/nmx_test.go new file mode 100644 index 0000000..3ded6d6 --- /dev/null +++ b/pkg/providers/netq/nmx_test.go @@ -0,0 +1,70 @@ +/* + * Copyright 2025 NVIDIA CORPORATION + * SPDX-License-Identifier: Apache-2.0 + */ + +package netq + +import ( + "os" + "testing" + + "github.com/NVIDIA/topograph/pkg/topology" + "github.com/stretchr/testify/require" +) + +func TestGetComputeUrl(t *testing.T) { + p := &Provider{ + params: &ProviderParams{}, + cred: &Credentials{user: "user", passwd: "passwd"}, + } + + testCases := []struct { + name string + serverURL string + headers map[string]string + computeUrl string + err string + }{ + { + name: "Case 1: invalid URL", + serverURL: `:///server`, + err: `parse ":///server": missing protocol scheme`, + }, + { + name: "Case 2: valid input", + serverURL: `https://server.com`, + headers: map[string]string{"Authorization": "Basic dXNlcjpwYXNzd2Q="}, + computeUrl: `https://server.com/nmx/v1/compute-nodes`, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + p.params.ApiURL = tc.serverURL + url, headers, err := p.getComputeUrl() + if len(tc.err) != 0 { + require.NotNil(t, err) + require.EqualError(t, err, tc.err) + } else { + require.Nil(t, err) + require.Equal(t, tc.computeUrl, url) + require.Equal(t, tc.headers, headers) + } + }) + } +} + +func TestParseComputeNodes(t *testing.T) { + data, err := os.ReadFile("../../../tests/output/netq/computeNodes.json") + require.NoError(t, err) + + domains, err := parseComputeNodes(data) + require.Nil(t, err) + + expected := topology.NewDomainMap() + expected.AddHost("3fbfc98b-7f95-4749-ab11-bd351a2aab3e", "node1", "node1") + expected.AddHost("3fbfc98b-7f95-4749-ab11-bd351a2aab3e", "node2", "node2") + + require.Equal(t, expected, domains) +} diff --git a/pkg/providers/netq/provider.go b/pkg/providers/netq/provider.go index 89c4d14..b9d8081 100644 --- a/pkg/providers/netq/provider.go +++ b/pkg/providers/netq/provider.go @@ -10,6 +10,8 @@ import ( "fmt" "net/http" + "k8s.io/klog/v2" + "github.com/NVIDIA/topograph/internal/config" "github.com/NVIDIA/topograph/internal/httperr" "github.com/NVIDIA/topograph/pkg/providers" @@ -85,7 +87,27 @@ func getParams(params map[string]any) (*ProviderParams, error) { } func (p *Provider) GenerateTopologyConfig(ctx context.Context, _ *int, instances []topology.ComputeInstances) (*topology.Vertex, *httperr.Error) { - return p.generateTopologyConfig(ctx, instances) + domains, err := p.getNvlDomains(ctx) + if err != nil { + klog.Warningf("Failed to get NVL domains: %v", err) + } + + treeRoot, err := p.getNetworkTree(ctx, instances) + if err != nil { + return nil, err + } + + root := &topology.Vertex{ + Vertices: map[string]*topology.Vertex{ + topology.TopologyTree: treeRoot, + }, + } + + if domains != nil { + root.Vertices[topology.TopologyBlock] = domains.ToBlocks() + } + + return root, nil } // Instances2NodeMap implements slurm.instanceMapper diff --git a/tests/output/netq/computeNodes.json b/tests/output/netq/computeNodes.json new file mode 100644 index 0000000..73e3aeb --- /dev/null +++ b/tests/output/netq/computeNodes.json @@ -0,0 +1,46 @@ +[ + { + "CreatedAt": "2025-11-04T16:45:16.482Z", + "Description": "Test", + "DomainUUID": "3fbfc98b-7f95-4749-ab11-bd351a2aab3e", + "GpuIDList": [ + "690a2d9cc5e905fa41a87cdc", + "690a2d9cc5e905fa41a87cdd", + "690a2d9cc5e905fa41a87cde", + "690a2d9cc5e905fa41a87cdf" + ], + "Health": "HEALTHY", + "ID": "690a2d9cc5e905fa41a87d79", + "LocationInfo": { + "ChassisID": 1, + "ChassisSerialNumber": "1825024170029", + "HostID": 1, + "SlotID": 5, + "TrayIndex": 4 + }, + "Name": "node1", + "UpdatedAt": "2025-11-05T18:37:47.155Z" + }, + { + "CreatedAt": "2025-11-04T16:45:16.482Z", + "Description": "", + "DomainUUID": "3fbfc98b-7f95-4749-ab11-bd351a2aab3e", + "GpuIDList": [ + "690a2d9cc5e905fa41a87ce0", + "690a2d9cc5e905fa41a87ce2", + "690a2d9cc5e905fa41a87ce3", + "690a2d9cc5e905fa41a87ce1" + ], + "Health": "HEALTHY", + "ID": "690a2d9cc5e905fa41a87d7a", + "LocationInfo": { + "ChassisID": 1, + "ChassisSerialNumber": "1825024170029", + "HostID": 1, + "SlotID": 22, + "TrayIndex": 12 + }, + "Name": "node2", + "UpdatedAt": "2025-11-05T18:37:47.155Z" + } +]