
Commit 0af665d

Merge branch 'main' into nitijain/kace-303
2 parents 1daa2af + bcfe5f8

15 files changed: +969 -44 lines changed

distros/kubernetes/nvsentinel/charts/syslog-health-monitor/templates/configmap.yaml

Lines changed: 7 additions & 0 deletions

@@ -35,3 +35,10 @@ data:
     - name: "SysLogsSXIDError"
       journalPath: "/nvsentinel/var/log/journal/"
 {{- end }}
+
+    - name: "SysLogsGPUFallenOff"
+{{- if .Values.global.kata.enabled }}
+      tags:
+        - "-u containerd.service"
+{{- end }}
+      journalPath: "/nvsentinel/var/log/journal/"
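
The kata-gated tags entry scopes the journal query to containerd's unit, since on Kata nodes the relevant kernel messages reach the host journal through containerd rather than directly. How the monitor consumes tags is not part of this diff; the Go sketch below is only a plausible reading of the mechanics, with a hypothetical Check struct and a journalctl invocation inferred from the config values above.

package main

import (
	"fmt"
	"os/exec"
	"strings"
)

// Check mirrors one entry of the ConfigMap above; the field names are
// illustrative, not taken from the actual monitor's config structs.
type Check struct {
	Name        string
	Tags        []string
	JournalPath string
}

func main() {
	check := Check{
		Name:        "SysLogsGPUFallenOff",
		Tags:        []string{"-u containerd.service"}, // rendered only when kata.enabled
		JournalPath: "/nvsentinel/var/log/journal/",
	}

	// Each tag is split into flag + value and appended to the journalctl
	// args, so the query is scoped to the containerd unit on Kata nodes.
	args := []string{"--directory", check.JournalPath, "--no-pager"}
	for _, tag := range check.Tags {
		args = append(args, strings.Fields(tag)...)
	}

	cmd := exec.Command("journalctl", args...)
	fmt.Println(cmd.String()) // journalctl --directory /nvsentinel/var/log/journal/ --no-pager -u containerd.service
}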

health-monitors/gpu-health-monitor/gpu_health_monitor/platform_connector/platform_connector.py

Lines changed: 4 additions & 3 deletions

@@ -118,6 +118,7 @@ def clear_dcgm_connectivity_failure(self, timestamp: Timestamp):
             self.send_health_event_with_retries(health_events)
         except Exception as e:
             log.error(f"Exception while sending DCGM connectivity restored events: {e}")
+            raise

     def health_event_occurred(
         self, health_details: dict[str, dcgmtypes.HealthDetails], gpu_ids: list, serials: dict[int, str]
@@ -248,8 +249,8 @@ def health_event_occurred(
         try:
             self.send_health_event_with_retries(health_events)
         except Exception as e:
-            log.error(f"Exception while sending health events: {e}. Events will be retried in next cycle.")
-            # Don't crash - continue monitoring
+            log.error(f"Exception while sending health events: {e}")
+            self.entity_cache = {}

     def get_recommended_action_from_dcgm_error_map(self, error_code):
         if error_code in self.dcgm_errors_info_dict:
@@ -322,4 +323,4 @@ def dcgm_connectivity_failed(self):
             self.send_health_event_with_retries(health_events)
         except Exception as e:
             log.error(f"Exception while sending DCGM connectivity failure events: {e}")
-            # Don't crash - continue monitoring
+            raise
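
The net effect of these hunks: DCGM connectivity-failure and connectivity-restored sends now re-raise after logging, so the process fails fast instead of silently dropping a connectivity transition, while ordinary health event send failures clear entity_cache so the same events are regenerated and re-sent on the next polling cycle. A toy Go sketch of that cache-reset retry policy follows; every identifier in it is hypothetical, not from the NVSentinel codebase.

package main

import (
	"errors"
	"log"
)

var errSendUnavailable = errors.New("platform connector unavailable")

// monitor is a stand-in for the Python PlatformConnector.
type monitor struct {
	entityCache map[string]bool // entities already reported this epoch
}

// reportCycle mirrors the new policy: on a failed send, log and drop the
// cache so the next cycle regenerates (and thereby retries) the events,
// instead of suppressing them as already reported.
func (m *monitor) reportCycle(events []string, send func([]string) error) {
	if err := send(events); err != nil {
		log.Printf("send failed: %v", err)
		m.entityCache = map[string]bool{}
	}
}

func main() {
	m := &monitor{entityCache: map[string]bool{"gpu-0": true}}
	m.reportCycle([]string{"xid-79"}, func([]string) error {
		return errSendUnavailable // simulate the connector being down
	})
	log.Printf("cache entries after failure: %d", len(m.entityCache)) // 0
}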

health-monitors/syslog-health-monitor/pkg/common/common.go

Lines changed: 5 additions & 0 deletions

@@ -23,11 +23,16 @@ import (
 	"strings"

 	pb "github.com/nvidia/nvsentinel/data-models/pkg/protos"
+	"github.com/nvidia/nvsentinel/health-monitors/syslog-health-monitor/pkg/patterns"
 	"github.com/nvidia/nvsentinel/health-monitors/syslog-health-monitor/pkg/types"

 	"github.com/thedatashed/xlsxreader"
 )

+// XIDPattern is the canonical pattern for detecting XID errors.
+// Re-exported from the patterns package for convenience.
+var XIDPattern = patterns.XIDPattern
+
 // NVIDIA XID Error Catalog - Embedded Excel File
 //
 // This embedded Excel file is sourced directly from NVIDIA's official documentation:
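
The shared definition now lives in the patterns package; the re-export keeps existing common.XIDPattern callers compiling unchanged. The actual regular expression is not shown in this commit, so the stand-in below only assumes the contract the new gpufallen handler relies on: group 1 captures a PCI address and group 2 a numeric XID code.

package main

import (
	"fmt"
	"regexp"
)

// Stand-in for patterns.XIDPattern, which is defined outside this diff.
var xidPattern = regexp.MustCompile(`Xid \((PCI:[0-9a-fA-F:.]+)\): (\d+)`)

func main() {
	line := "NVRM: Xid (PCI:0000:3b:00): 79, pid=1234, GPU has fallen off the bus."
	// Mirrors trackXIDIfPresent: a match with >= 3 submatches yields the
	// PCI address and the XID code.
	if m := xidPattern.FindStringSubmatch(line); len(m) >= 3 {
		fmt.Println("pci:", m[1], "xid:", m[2]) // pci: PCI:0000:3b:00 xid: 79
	}
}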
(new file, package gpufallen)

Lines changed: 237 additions & 0 deletions

@@ -0,0 +1,237 @@
+// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gpufallen
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"time"
+
+	pb "github.com/nvidia/nvsentinel/data-models/pkg/protos"
+	"github.com/nvidia/nvsentinel/health-monitors/syslog-health-monitor/pkg/common"
+
+	"google.golang.org/protobuf/types/known/timestamppb"
+)
+
+// NewGPUFallenHandler creates a new GPUFallenHandler instance.
+func NewGPUFallenHandler(nodeName, defaultAgentName,
+	defaultComponentClass, checkName string) (*GPUFallenHandler, error) {
+	ctx, cancel := context.WithCancel(context.Background())
+
+	h := &GPUFallenHandler{
+		nodeName:              nodeName,
+		defaultAgentName:      defaultAgentName,
+		defaultComponentClass: defaultComponentClass,
+		checkName:             checkName,
+		recentXIDs:            make(map[string]xidRecord),
+		xidWindow:             5 * time.Minute, // Remember XIDs for 5 minutes
+		cancelCleanup:         cancel,
+	}
+
+	// Start background cleanup goroutine to prevent unbounded memory growth
+	go h.cleanupExpiredXIDs(ctx)
+
+	return h, nil
+}
+
+// Close stops the background cleanup goroutine and releases resources.
+// Should be called when the handler is no longer needed (e.g., in tests or shutdown).
+func (h *GPUFallenHandler) Close() {
+	if h.cancelCleanup != nil {
+		h.cancelCleanup()
+	}
+}
+
+// SetXIDWindow sets the time window for tracking XID errors.
+// This is primarily used for testing with shorter time windows.
+func (h *GPUFallenHandler) SetXIDWindow(window time.Duration) {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	h.xidWindow = window
+}
+
+// ProcessLine processes a single syslog line and returns any generated health events.
+func (h *GPUFallenHandler) ProcessLine(message string) (*pb.HealthEvents, error) {
+	// First check if this is an XID message and track it
+	h.trackXIDIfPresent(message)
+
+	// Check if this is a GPU falling off error
+	event := h.parseGPUFallenError(message)
+	if event == nil {
+		return nil, nil
+	}
+
+	return h.createHealthEventFromError(event), nil
+}
+
+// trackXIDIfPresent checks if the message contains an XID error and records it
+func (h *GPUFallenHandler) trackXIDIfPresent(message string) {
+	matches := common.XIDPattern.FindStringSubmatch(message)
+	if len(matches) < 3 {
+		return // Not an XID message
+	}
+
+	pciAddr := matches[1]
+	xidCode := 0
+	// Parse XID code (matches[2])
+	if _, err := fmt.Sscanf(matches[2], "%d", &xidCode); err != nil {
+		slog.Warn("Failed to parse XID code from message",
+			"pci_address", pciAddr,
+			"xid_string", matches[2],
+			"error", err)
+
+		return
+	}
+
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	h.recentXIDs[pciAddr] = xidRecord{
+		timestamp: time.Now(),
+		xidCode:   xidCode,
+	}
+}
+
+// hasRecentXID checks if a PCI address has had an XID error within the time window
+// and opportunistically cleans up expired entries to prevent memory leaks
+func (h *GPUFallenHandler) hasRecentXID(pciAddr string) bool {
+	h.mu.RLock()
+	record, exists := h.recentXIDs[pciAddr]
+	h.mu.RUnlock()
+
+	if !exists {
+		return false
+	}
+
+	// Check if the XID is still within the time window
+	isRecent := time.Since(record.timestamp) < h.xidWindow
+
+	// If expired, remove it from the map to prevent memory leaks
+	if !isRecent {
+		h.mu.Lock()
+		// Double-check after acquiring write lock (entry might have been updated)
+		if record, exists := h.recentXIDs[pciAddr]; exists && time.Since(record.timestamp) >= h.xidWindow {
+			delete(h.recentXIDs, pciAddr)
+		}
+		h.mu.Unlock()
+	}
+
+	return isRecent
+}
+
+// cleanupExpiredXIDs runs periodically to remove expired XID entries.
+// This prevents unbounded memory growth from XIDs on GPUs that never experience fallen-off errors.
+// The goroutine stops when the context is cancelled (via the Close method).
+func (h *GPUFallenHandler) cleanupExpiredXIDs(ctx context.Context) {
+	// Calculate cleanup interval proportional to xidWindow (xidWindow / 5).
+	// This ensures tests with short windows don't wait too long for cleanup.
+	h.mu.RLock()
+	cleanupInterval := h.xidWindow / 5
+	h.mu.RUnlock()
+
+	ticker := time.NewTicker(cleanupInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			// Context cancelled, stop cleanup goroutine
+			return
+		case <-ticker.C:
+			h.mu.Lock()
+			now := time.Now()
+			window := h.xidWindow // Read current window value
+
+			for pciAddr, record := range h.recentXIDs {
+				if now.Sub(record.timestamp) >= window {
+					delete(h.recentXIDs, pciAddr)
+				}
+			}
+			h.mu.Unlock()
+		}
+	}
+}
+
+func (h *GPUFallenHandler) parseGPUFallenError(message string) *gpuFallenErrorEvent {
+	// First check whether this message itself contains an "Xid" in the same line.
+	// If it carries an XID error it should be handled by the XID handler.
+	if common.XIDPattern.MatchString(message) {
+		return nil
+	}
+
+	m := reGPUFallenPattern.FindStringSubmatch(message)
+	if len(m) < 2 {
+		return nil
+	}
+
+	pciAddr := m[1]
+
+	// Check if this PCI address has had a recent XID error.
+	// If so, skip generating an event to avoid duplicates with the XID handler.
+	if h.hasRecentXID(pciAddr) {
+		return nil
+	}
+
+	// Try to extract the PCI ID if present in the message
+	pciID := ""
+	pciIDMatch := rePCIIDPattern.FindStringSubmatch(message)
+
+	if len(pciIDMatch) >= 2 {
+		pciID = pciIDMatch[1]
+	}
+
+	return &gpuFallenErrorEvent{
+		pciAddr: pciAddr,
+		pciID:   pciID,
+		message: message,
+	}
+}
+
+func (h *GPUFallenHandler) createHealthEventFromError(event *gpuFallenErrorEvent) *pb.HealthEvents {
+	entitiesImpacted := []*pb.Entity{
+		{EntityType: "PCI", EntityValue: event.pciAddr},
+	}
+
+	// If the PCI ID is present, add it as well
+	if event.pciID != "" {
+		entitiesImpacted = append(entitiesImpacted, &pb.Entity{
+			EntityType: "PCI_ID", EntityValue: event.pciID,
+		})
+	}
+
+	// Increment metrics (node-level only to avoid cardinality explosion)
+	gpuFallenCounterMetric.WithLabelValues(h.nodeName).Inc()
+
+	healthEvent := &pb.HealthEvent{
+		Version:            1,
+		Agent:              h.defaultAgentName,
+		CheckName:          h.checkName,
+		ComponentClass:     h.defaultComponentClass,
+		GeneratedTimestamp: timestamppb.New(time.Now()),
+		EntitiesImpacted:   entitiesImpacted,
+		Message:            event.message,
+		IsFatal:            true, // GPU falling off the bus is always fatal
+		IsHealthy:          false,
+		NodeName:           h.nodeName,
+		RecommendedAction:  pb.RecommendedAction_RESTART_BM,
+		ErrorCode:          []string{"GPU_FALLEN_OFF_BUS"},
+	}

+	return &pb.HealthEvents{
+		Version: 1,
+		Events:  []*pb.HealthEvent{healthEvent},
+	}
+}
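
A minimal usage sketch for the handler's public surface (constructor, ProcessLine, Close). The import path, argument values, and sample journal line are assumptions; reGPUFallenPattern and the GPUFallenHandler struct are defined elsewhere in this commit, so treat this as illustrative rather than the monitor's actual wiring.

package main

import (
	"fmt"
	"log"

	// Assumed package path, following the pkg/common layout seen above.
	"github.com/nvidia/nvsentinel/health-monitors/syslog-health-monitor/pkg/gpufallen"
)

func main() {
	h, err := gpufallen.NewGPUFallenHandler(
		"node-0",                // nodeName
		"syslog-health-monitor", // defaultAgentName (illustrative)
		"GPU",                   // defaultComponentClass (illustrative)
		"SysLogsGPUFallenOff",   // checkName, matching the ConfigMap entry
	)
	if err != nil {
		log.Fatal(err)
	}
	defer h.Close() // stop the background cleanup goroutine

	// A fallen-off message with no Xid in the same line, and no recent XID
	// recorded for this PCI address, should yield one fatal event with
	// RecommendedAction_RESTART_BM. The exact line format accepted by
	// reGPUFallenPattern is an assumption here.
	line := "NVRM: GPU 0000:3b:00.0: GPU has fallen off the bus."
	events, err := h.ProcessLine(line)
	if err != nil {
		log.Fatal(err)
	}
	if events != nil {
		fmt.Println("generated", len(events.Events), "event(s)")
	}
}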
