Skip to content

Commit 19b0c91

Browse files
authored
feat: add application logic for metadata collector (#278)
Signed-off-by: Ajay Mishra <[email protected]>
1 parent 27ca26d commit 19b0c91

File tree

12 files changed

+1132
-44
lines changed

12 files changed

+1132
-44
lines changed
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package model
16+
17+
type GPUMetadata struct {
18+
Version string `json:"version"`
19+
Timestamp string `json:"timestamp"`
20+
NodeName string `json:"node_name"`
21+
ChassisSerial *string `json:"chassis_serial"`
22+
GPUs []GPUInfo `json:"gpus"`
23+
NVSwitches []string `json:"nvswitches"`
24+
}
25+
26+
type GPUInfo struct {
27+
GPUID int `json:"gpu_id"`
28+
UUID string `json:"uuid"`
29+
PCIAddress string `json:"pci_address"`
30+
SerialNumber string `json:"serial_number"`
31+
DeviceName string `json:"device_name"`
32+
NVLinks []NVLink `json:"nvlinks"`
33+
}
34+
35+
// NVLink describes one NVLink connection of a GPU.
type NVLink struct {
	// LinkID is the link's identifier on the local GPU.
	LinkID int `json:"link_id"`
	// RemotePCIAddress is the PCI address of the device at the other end.
	RemotePCIAddress string `json:"remote_pci_address"`
	// RemoteLinkID is the link's identifier on the remote device. The
	// collector logs that remote link IDs will be -1 when the nvidia-smi
	// topology could not be parsed.
	RemoteLinkID int `json:"remote_link_id"`
}

metadata-collector/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ FROM golang:1.25-bookworm AS builder
1717
WORKDIR /workspace
1818

1919
COPY commons/ commons/
20-
20+
COPY data-models/ data-models/
2121

2222
COPY metadata-collector/go.mod metadata-collector/go.sum metadata-collector/
2323
WORKDIR /workspace/metadata-collector
@@ -27,7 +27,7 @@ COPY metadata-collector/ .
2727

2828
RUN CGO_ENABLED=1 GOOS=linux go build -a -ldflags="-s -w" -o metadata-collector main.go
2929

30-
FROM ubuntu:22.04
30+
FROM nvidia/cuda:12.3.0-base-ubuntu22.04
3131

3232
RUN groupadd -r nvsentinel && useradd -r -g nvsentinel nvsentinel
3333

metadata-collector/go.mod

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,25 @@ go 1.25
55
toolchain go1.25.3
66

77
require (
8-
github.com/NVIDIA/go-nvml v0.12.4-0
8+
github.com/NVIDIA/go-nvml v0.13.0-1
99
github.com/nvidia/nvsentinel/commons v0.0.0
10+
github.com/nvidia/nvsentinel/data-models v0.0.0
11+
github.com/stretchr/testify v1.11.1
1012
)
1113

12-
replace github.com/nvidia/nvsentinel/commons => ../commons
14+
require (
15+
github.com/davecgh/go-spew v1.1.1 // indirect
16+
github.com/pmezard/go-difflib v1.0.0 // indirect
17+
golang.org/x/net v0.46.0 // indirect
18+
golang.org/x/sys v0.37.0 // indirect
19+
golang.org/x/text v0.30.0 // indirect
20+
google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f // indirect
21+
google.golang.org/grpc v1.76.0 // indirect
22+
google.golang.org/protobuf v1.36.10 // indirect
23+
gopkg.in/yaml.v3 v3.0.1 // indirect
24+
)
25+
26+
replace (
27+
github.com/nvidia/nvsentinel/commons => ../commons
28+
github.com/nvidia/nvsentinel/data-models => ../data-models
29+
)

metadata-collector/go.sum

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,48 @@
1-
github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg=
2-
github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
1+
github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw=
2+
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
33
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
44
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
5+
github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
6+
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
7+
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
8+
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
9+
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
10+
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
11+
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
12+
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
13+
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
14+
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
515
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
616
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
717
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
818
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
19+
go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
20+
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
21+
go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8=
22+
go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM=
23+
go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE=
24+
go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E=
25+
go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI=
26+
go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg=
27+
go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc=
28+
go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps=
29+
go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4=
30+
go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0=
31+
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
32+
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
33+
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
34+
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
35+
golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
36+
golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
37+
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
38+
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
39+
google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f h1:1FTH6cpXFsENbPR5Bu8NQddPSaUUE6NA2XdZdDSAJK4=
40+
google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
41+
google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A=
42+
google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c=
43+
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
44+
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
45+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
46+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
947
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
1048
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

metadata-collector/main.go

Lines changed: 46 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -15,84 +15,92 @@
1515
package main
1616

1717
import (
18+
"context"
19+
"flag"
1820
"fmt"
1921
"log/slog"
2022
"os"
23+
"os/signal"
24+
"syscall"
2125

22-
"github.com/NVIDIA/go-nvml/pkg/nvml"
2326
"github.com/nvidia/nvsentinel/commons/pkg/logger"
27+
"github.com/nvidia/nvsentinel/metadata-collector/pkg/collector"
28+
"github.com/nvidia/nvsentinel/metadata-collector/pkg/nvml"
29+
"github.com/nvidia/nvsentinel/metadata-collector/pkg/writer"
2430
)
2531

2632
const (
	// defaultAgentName is the component name handed to the structured logger.
	defaultAgentName = "metadata-collector"

	// defaultOutputPath is the default value of the -output-path flag: the
	// file the GPU metadata JSON is written to.
	defaultOutputPath = "/var/lib/nvsentinel/gpu_metadata.json"
)
2936

3037
var (
3138
version = "dev"
3239
commit = "none"
3340
date = "unknown"
41+
42+
outputPath = flag.String("output-path", defaultOutputPath, "Path to write the GPU metadata JSON file")
3443
)
3544

3645
func main() {
46+
flag.Parse()
47+
3748
logger.SetDefaultStructuredLogger(defaultAgentName, version)
3849
slog.Info("Starting metadata-collector", "version", version, "commit", commit, "date", date)
3950

40-
if err := run(); err != nil {
51+
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
52+
53+
if err := run(ctx); err != nil {
4154
slog.Error("Metadata collector failed", "error", err)
55+
cancel()
4256
os.Exit(1)
4357
}
58+
59+
cancel()
60+
slog.Info("Metadata collector completed successfully")
4461
}
4562

46-
func run() error {
47-
ret := nvml.Init()
48-
if ret != nvml.SUCCESS {
49-
return fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
63+
func run(ctx context.Context) error {
64+
slog.Info("Initializing NVML")
65+
66+
nvmlWrapper := &nvml.NVMLWrapper{}
67+
if err := nvmlWrapper.Init(); err != nil {
68+
return fmt.Errorf("failed to initialize NVML: %w", err)
5069
}
5170

5271
defer func() {
53-
ret := nvml.Shutdown()
54-
if ret != nvml.SUCCESS {
55-
slog.Error("Failed to shutdown NVML", "error", nvml.ErrorString(ret))
72+
if err := nvmlWrapper.Shutdown(); err != nil {
73+
slog.Error("Failed to shutdown NVML", "error", err)
5674
}
5775
}()
5876

59-
count, ret := nvml.DeviceGetCount()
60-
if ret != nvml.SUCCESS {
61-
return fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret))
62-
}
77+
slog.Info("Collecting GPU metadata")
6378

64-
hostname, _ := os.Hostname()
79+
metadataCollector := collector.NewCollector(nvmlWrapper)
6580

66-
slog.Info("GPU metadata collection started", "node", hostname, "gpu_count", count)
67-
68-
if nvmlVersion, ret := nvml.SystemGetNVMLVersion(); ret == nvml.SUCCESS {
69-
slog.Info("NVML version", "version", nvmlVersion)
81+
metadata, err := metadataCollector.Collect(ctx)
82+
if err != nil {
83+
return fmt.Errorf("failed to collect GPU metadata: %w", err)
7084
}
7185

72-
fmt.Printf("\n=== GPU Metadata Collector ===\n")
73-
fmt.Printf("Node: %s\n", hostname)
74-
fmt.Printf("GPUs Found: %d\n", count)
75-
76-
fmt.Println("\n=== GPU Details ===")
86+
slog.Info("GPU metadata collected",
87+
"node", metadata.NodeName,
88+
"gpu_count", len(metadata.GPUs),
89+
"nvswitch_count", len(metadata.NVSwitches),
90+
)
7791

78-
for i := range count {
79-
device, ret := nvml.DeviceGetHandleByIndex(i)
80-
if ret != nvml.SUCCESS {
81-
slog.Warn("Failed to get device", "gpu_id", i, "error", nvml.ErrorString(ret))
82-
continue
83-
}
92+
slog.Info("Writing metadata to file", "output_path", *outputPath)
8493

85-
name, _ := device.GetName()
86-
uuid, _ := device.GetUUID()
87-
88-
slog.Info("GPU discovered", "gpu_id", i, "name", name, "uuid", uuid)
94+
metadataWriter, err := writer.NewWriter(*outputPath)
95+
if err != nil {
96+
return fmt.Errorf("failed to create metadata writer: %w", err)
97+
}
8998

90-
fmt.Printf("\nGPU %d:\n", i)
91-
fmt.Printf(" Name: %s\n", name)
92-
fmt.Printf(" UUID: %s\n", uuid)
99+
if err := metadataWriter.Write(metadata); err != nil {
100+
return fmt.Errorf("failed to write metadata: %w", err)
93101
}
94102

95-
slog.Info("Metadata collector hello world completed successfully")
103+
slog.Info("Successfully wrote GPU metadata", "output_path", *outputPath)
96104

97105
return nil
98106
}
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package collector
16+
17+
import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"slices"
	"time"

	gonvml "github.com/NVIDIA/go-nvml/pkg/nvml"
	"github.com/nvidia/nvsentinel/data-models/pkg/model"
	"github.com/nvidia/nvsentinel/metadata-collector/pkg/nvml"
)
28+
29+
type Collector struct {
30+
nvml *nvml.NVMLWrapper
31+
}
32+
33+
func NewCollector(nvmlWrapper *nvml.NVMLWrapper) *Collector {
34+
return &Collector{
35+
nvml: nvmlWrapper,
36+
}
37+
}
38+
39+
func (c *Collector) Collect(ctx context.Context) (*model.GPUMetadata, error) {
40+
count, err := c.nvml.GetDeviceCount()
41+
if err != nil {
42+
return nil, fmt.Errorf("failed to get GPU device count: %w", err)
43+
}
44+
45+
nodeName, err := os.Hostname()
46+
if err != nil {
47+
return nil, fmt.Errorf("failed to get hostname: %w", err)
48+
}
49+
50+
deviceMap, parsedTopology, err := c.prepareTopologyData(ctx)
51+
if err != nil {
52+
return nil, fmt.Errorf("failed to prepare topology data: %w", err)
53+
}
54+
55+
metadata := &model.GPUMetadata{
56+
Version: "1.0",
57+
Timestamp: time.Now().UTC().Format(time.RFC3339),
58+
NodeName: nodeName,
59+
GPUs: make([]model.GPUInfo, 0, count),
60+
}
61+
62+
if err := c.collectGPUData(count, metadata, deviceMap, parsedTopology); err != nil {
63+
return nil, fmt.Errorf("failed to collect GPU data: %w", err)
64+
}
65+
66+
return metadata, nil
67+
}
68+
69+
func (c *Collector) prepareTopologyData(
70+
ctx context.Context,
71+
) (map[string]gonvml.Device, map[int]nvml.GPUNVLinkTopology, error) {
72+
slog.Info("Building device map for NVLink topology discovery")
73+
74+
deviceMap, err := c.nvml.BuildDeviceMap()
75+
if err != nil {
76+
return nil, nil, fmt.Errorf("failed to build device map: %w", err)
77+
}
78+
79+
slog.Info("Parsing NVLink topology from nvidia-smi")
80+
81+
parsedTopology, err := c.nvml.ParseNVLinkTopologyWithContext(ctx)
82+
if err != nil {
83+
slog.Warn("Failed to parse nvidia-smi topology, remote link IDs will be -1", "error", err)
84+
85+
parsedTopology = make(map[int]nvml.GPUNVLinkTopology)
86+
}
87+
88+
return deviceMap, parsedTopology, nil
89+
}
90+
91+
func (c *Collector) collectGPUData(
92+
count int,
93+
metadata *model.GPUMetadata,
94+
deviceMap map[string]gonvml.Device,
95+
parsedTopology map[int]nvml.GPUNVLinkTopology,
96+
) error {
97+
nvswitchSet := make(map[string]bool)
98+
99+
var chassisSerial *string
100+
101+
for i := range count {
102+
nvmlGPUInfo, err := c.nvml.GetGPUInfo(i)
103+
if err != nil {
104+
return fmt.Errorf("failed to get GPU info for GPU %d: %w", i, err)
105+
}
106+
107+
if i == 0 {
108+
chassisSerial = c.nvml.GetChassisSerial(i)
109+
}
110+
111+
nvswitches, err := c.nvml.CollectNVLinkTopology(nvmlGPUInfo, i, deviceMap, parsedTopology)
112+
if err != nil {
113+
slog.Warn("Failed to collect NVLink topology for GPU", "gpu_id", i, "error", err)
114+
} else {
115+
for pci := range nvswitches {
116+
nvswitchSet[pci] = true
117+
}
118+
}
119+
120+
metadata.GPUs = append(metadata.GPUs, *nvmlGPUInfo)
121+
}
122+
123+
metadata.ChassisSerial = chassisSerial
124+
125+
metadata.NVSwitches = make([]string, 0, len(nvswitchSet))
126+
for pci := range nvswitchSet {
127+
metadata.NVSwitches = append(metadata.NVSwitches, pci)
128+
}
129+
130+
return nil
131+
}

0 commit comments

Comments
 (0)