Skip to content

Commit 79b0584

Browse files
authored
feat: integrate syslog health monitor with metadata-collector (#288)
Signed-off-by: Ajay Mishra <[email protected]>
1 parent 04eb2b3 commit 79b0584

File tree

25 files changed

+1406
-1143
lines changed

25 files changed

+1406
-1143
lines changed

distros/kubernetes/nvsentinel/charts/metadata-collector/templates/daemonset.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@ spec:
5050
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}"
5151
imagePullPolicy: {{ .Values.image.pullPolicy }}
5252
args:
53-
- --output-path={{ .Values.outputPath }}
53+
- --output-path={{ .Values.global.metadataPath }}
5454
env:
5555
- name: NODE_NAME
5656
valueFrom:

distros/kubernetes/nvsentinel/charts/metadata-collector/values.yaml

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,5 +31,3 @@ resources:
3131
cpu: 100m
3232
memory: 128Mi
3333

34-
outputPath: /var/lib/nvsentinel/gpu_metadata.json
35-

distros/kubernetes/nvsentinel/charts/syslog-health-monitor/templates/_helpers.tpl

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -98,6 +98,8 @@ spec:
9898
{{- end }}
9999
- "--checks"
100100
- "{{ join "," $root.Values.enabledChecks }}"
101+
- "--metadata-path"
102+
- "{{ $root.Values.global.metadataPath }}"
101103
resources:
102104
{{- toYaml $root.Values.resources | nindent 12 }}
103105
ports:
@@ -130,6 +132,9 @@ spec:
130132
mountPath: /var/run/
131133
- name: syslog-state-vol
132134
mountPath: /var/run/syslog_health_monitor
135+
- name: metadata-vol
136+
mountPath: /var/lib/nvsentinel
137+
readOnly: true
133138
{{- if $kataMode }}
134139
# Kata mode: Mount systemd journal for accessing host logs
135140
- name: host-journal
@@ -181,6 +186,10 @@ spec:
181186
hostPath:
182187
path: /var/run/syslog_health_monitor
183188
type: DirectoryOrCreate
189+
- name: metadata-vol
190+
hostPath:
191+
path: /var/lib/nvsentinel
192+
type: DirectoryOrCreate
184193
{{- if $kataMode }}
185194
# Kata mode: Systemd journal volumes for host log access
186195
- name: host-journal

distros/kubernetes/nvsentinel/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,9 @@ global:
1818
tag: "main"
1919
metricsPort: 2112
2020

21+
# Shared metadata path used by metadata-collector and syslog-health-monitor
22+
metadataPath: /var/lib/nvsentinel/gpu_metadata.json
23+
2124
nodeSelector: {}
2225
tolerations: []
2326
affinity: {}

health-monitors/syslog-health-monitor/Dockerfile

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,6 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
ARG CUDA_VERSION="12.1.1-base-ubuntu22.04"
16-
1715
FROM public.ecr.aws/docker/library/golang:1.25-trixie AS builder
1816

1917
ARG BUILD_TAGS="systemd"
@@ -47,16 +45,17 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
4745
RUN cd health-monitors/syslog-health-monitor && \
4846
CGO_ENABLED=1 go build -tags "${BUILD_TAGS}" -ldflags="-s -w" -o syslog-health-monitor main.go
4947

50-
FROM nvcr.io/nvidia/cuda:${CUDA_VERSION} AS runtime
48+
# Runtime stage - using Debian slim for systemd journal support
49+
FROM public.ecr.aws/docker/library/debian:bookworm-slim AS runtime
5150

52-
RUN apt-get update && apt-get install -y \
53-
systemd-sysv \
54-
--no-install-recommends \
51+
# Install required system libraries for systemd journal
52+
RUN apt-get update && apt-get install -y --no-install-recommends \
53+
libsystemd0 \
54+
liblz4-1 \
55+
libzstd1 \
56+
ca-certificates \
5557
&& rm -rf /var/lib/apt/lists/*
5658

5759
COPY --from=builder /go/src/nvsentinel/health-monitors/syslog-health-monitor/syslog-health-monitor /app/syslog-health-monitor
5860

59-
RUN apt-get update && \
60-
apt-get install -y libsystemd0 liblz4-1 libzstd1
61-
6261
ENTRYPOINT ["/app/syslog-health-monitor"]

health-monitors/syslog-health-monitor/main.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,8 @@ var (
6565
"Endpoint to the XID analyser service.")
6666
kataEnabled = flag.String("kata-enabled", "false",
6767
"Indicates if this monitor is running in Kata Containers mode (set by DaemonSet variant).")
68+
metadataPath = flag.String("metadata-path", "/var/lib/nvsentinel/gpu_metadata.json",
69+
"Path to GPU metadata JSON file.")
6870
)
6971

7072
var checks []fd.CheckDefinition
@@ -166,6 +168,7 @@ func run() error {
166168
*pollingIntervalFlag,
167169
*stateFileFlag,
168170
*xidAnalyserEndpoint,
171+
*metadataPath,
169172
)
170173
if err != nil {
171174
return fmt.Errorf("error creating syslog health monitor: %w", err)
Lines changed: 169 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,169 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package metadata
16+
17+
import (
18+
"encoding/json"
19+
"fmt"
20+
"log/slog"
21+
"os"
22+
"strings"
23+
"sync"
24+
25+
"github.com/nvidia/nvsentinel/data-models/pkg/model"
26+
)
27+
28+
type Reader struct {
29+
path string
30+
31+
once sync.Once
32+
loadErr error
33+
34+
metadata *model.GPUMetadata
35+
36+
pciToGPU map[string]*model.GPUInfo
37+
nvswitchLinks map[string]map[int]*gpuLinkInfo
38+
}
39+
40+
type gpuLinkInfo struct {
41+
GPU *model.GPUInfo
42+
LocalLinkID int
43+
}
44+
45+
func NewReader(path string) *Reader {
46+
return &Reader{
47+
path: path,
48+
}
49+
}
50+
51+
func (r *Reader) ensureLoaded() error {
52+
r.once.Do(func() {
53+
r.loadErr = r.load()
54+
})
55+
56+
return r.loadErr
57+
}
58+
59+
func (r *Reader) load() error {
60+
data, err := os.ReadFile(r.path)
61+
if err != nil {
62+
return fmt.Errorf("failed to read metadata file: %w", err)
63+
}
64+
65+
var metadata model.GPUMetadata
66+
if err := json.Unmarshal(data, &metadata); err != nil {
67+
return fmt.Errorf("failed to parse metadata JSON: %w", err)
68+
}
69+
70+
r.metadata = &metadata
71+
r.buildMaps()
72+
73+
slog.Info("GPU metadata loaded",
74+
"gpus", len(metadata.GPUs),
75+
"nvswitches", len(metadata.NVSwitches),
76+
"chassis_serial", metadata.ChassisSerial != nil)
77+
78+
return nil
79+
}
80+
81+
func (r *Reader) buildMaps() {
82+
r.pciToGPU = make(map[string]*model.GPUInfo)
83+
r.nvswitchLinks = make(map[string]map[int]*gpuLinkInfo)
84+
85+
for i := range r.metadata.GPUs {
86+
gpu := &r.metadata.GPUs[i]
87+
normPCI := normalizePCI(gpu.PCIAddress)
88+
r.pciToGPU[normPCI] = gpu
89+
90+
for _, link := range gpu.NVLinks {
91+
remotePCI := normalizePCI(link.RemotePCIAddress)
92+
93+
if r.nvswitchLinks[remotePCI] == nil {
94+
r.nvswitchLinks[remotePCI] = make(map[int]*gpuLinkInfo)
95+
}
96+
97+
r.nvswitchLinks[remotePCI][link.RemoteLinkID] = &gpuLinkInfo{
98+
GPU: gpu,
99+
LocalLinkID: link.LinkID,
100+
}
101+
}
102+
}
103+
}
104+
105+
func (r *Reader) GetGPUByPCI(pci string) (*model.GPUInfo, error) {
106+
if err := r.ensureLoaded(); err != nil {
107+
return nil, fmt.Errorf("failed to load metadata for PCI lookup %s: %w", pci, err)
108+
}
109+
110+
normPCI := normalizePCI(pci)
111+
gpu, ok := r.pciToGPU[normPCI]
112+
113+
if !ok {
114+
return nil, fmt.Errorf("GPU not found for PCI address: %s", pci)
115+
}
116+
117+
return gpu, nil
118+
}
119+
120+
func (r *Reader) GetGPUByNVSwitchLink(nvswitchPCI string, linkID int) (*model.GPUInfo, int, error) {
121+
if err := r.ensureLoaded(); err != nil {
122+
return nil, -1, fmt.Errorf("failed to load metadata for NVSwitch lookup %s link %d: %w", nvswitchPCI, linkID, err)
123+
}
124+
125+
normPCI := normalizePCI(nvswitchPCI)
126+
links, ok := r.nvswitchLinks[normPCI]
127+
128+
if !ok {
129+
return nil, -1, fmt.Errorf("NVSwitch not found: %s", nvswitchPCI)
130+
}
131+
132+
info, ok := links[linkID]
133+
134+
if !ok {
135+
return nil, -1, fmt.Errorf("link %d not found on NVSwitch %s", linkID, nvswitchPCI)
136+
}
137+
138+
return info.GPU, info.LocalLinkID, nil
139+
}
140+
141+
func (r *Reader) GetChassisSerial() *string {
142+
if err := r.ensureLoaded(); err != nil {
143+
return nil
144+
}
145+
146+
return r.metadata.ChassisSerial
147+
}
148+
149+
// normalizePCI converts a PCI address into the canonical key form used by the
// lookup maps.
//
// An address with exactly three colon-separated fields is rewritten as
// follows: the first field is truncated to its last four characters, anything
// from the first "." onward in the third field is dropped, and all fields are
// lowercased. For example "00000000:17:00.0" becomes "0000:17:00".
//
// Any other input is only lowercased.
func normalizePCI(pci string) string {
	fields := strings.Split(pci, ":")
	if len(fields) != 3 {
		return strings.ToLower(pci)
	}

	first, second, third := fields[0], fields[1], fields[2]

	// Keep only the trailing four characters of an over-long first field.
	if surplus := len(first) - 4; surplus > 0 {
		first = first[surplus:]
	}

	// Drop the ".<suffix>" part of the last field, if present.
	if head, _, hasDot := strings.Cut(third, "."); hasDot {
		third = head
	}

	return fmt.Sprintf("%s:%s:%s",
		strings.ToLower(first),
		strings.ToLower(second),
		strings.ToLower(third))
}

0 commit comments

Comments
 (0)