Skip to content

Commit e8bad66

Browse files
authored
fix: tone down logging and test case fixes (#269)
1 parent 7d534d7 commit e8bad66

File tree

4 files changed: 29 additions (+29) and 11 deletions (-11).

.coderabbit.yaml

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,18 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
reviews:
16+
profile: chill
17+
auto_review:
18+
enabled: true

health-monitors/syslog-health-monitor/pkg/syslog-monitor/syslogmonitor.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -662,14 +662,14 @@ func (sm *SyslogMonitor) processJournalEntries(journal Journal, check CheckDefin
662662
}
663663
// This entry (matched or not) is considered processed.
664664
sm.checkLastCursors[check.Name] = currentEntryCursor // Update cursor for the next run
665-
slog.Info("Check, considered processed", "name", check.Name,
665+
slog.Debug("Check errored but considered processed", "name", check.Name,
666666
"message", message,
667667
"cursor", currentEntryCursor)
668668
}
669669

670670
advancedNext, advErr := journal.Next()
671671
if advErr == io.EOF || advancedNext == 0 { //nolint:errorlint // TODO
672-
slog.Info("Check, no more", "name", check.Name, "cursor", currentEntryCursor)
672+
slog.Info("Check no more", "name", check.Name, "cursor", currentEntryCursor)
673673
// sm.checkLastCursors[checkName] is already set to currentEntryCursor.
674674
break
675675
}

health-monitors/syslog-health-monitor/pkg/xid/parser/sidecar.go

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -35,9 +35,12 @@ type SidecarParser struct {
3535

3636
// NewSidecarParser creates a new sidecar parser
3737
func NewSidecarParser(endpoint, nodeName string) *SidecarParser {
38+
c := retryablehttp.NewClient()
39+
c.Logger = slog.With("http", "retryablehttp-client")
40+
3841
return &SidecarParser{
3942
url: fmt.Sprintf("%s/decode-xid", endpoint),
40-
client: retryablehttp.NewClient(),
43+
client: c,
4144
nodeName: nodeName,
4245
}
4346
}
@@ -74,7 +77,7 @@ func (p *SidecarParser) Parse(message string) (*Response, error) {
7477
defer resp.Body.Close()
7578

7679
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
77-
slog.Error("HTTP request failed", "statusCode", resp.StatusCode)
80+
slog.Debug("HTTP request failed", "statusCode", resp.StatusCode)
7881
metrics.XidProcessingErrors.WithLabelValues("http_status_error", p.nodeName).Inc()
7982

8083
return nil, fmt.Errorf("HTTP request failed with status code: %d", resp.StatusCode)

tests/uat/tests.sh

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ test_gpu_monitoring_dcgm() {
7676
log "========================================="
7777

7878
local gpu_node
79-
gpu_node=$(kubectl get nodes -l workload-type=gpu -o jsonpath='{.items[0].metadata.name}')
79+
gpu_node=$(kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].metadata.name}')
8080

8181
if [[ -z "$gpu_node" ]]; then
8282
error "No GPU nodes found"
@@ -103,7 +103,7 @@ test_gpu_monitoring_dcgm() {
103103
local max_wait=30
104104
local waited=0
105105
while [[ $waited -lt $max_wait ]]; do
106-
conditions_count=$(kubectl get node "$gpu_node" -o json | jq '[.status.conditions[] | select(.type == "GpuInforomWatch" or .type == "GpuPcieWatch")] | length')
106+
conditions_count=$(kubectl get node "$gpu_node" -o json | jq '[.status.conditions[] | select(.status == "True" and (.type == "GpuInforomWatch" or .type == "GpuPcieWatch"))] | length')
107107
if [[ "$conditions_count" -ge 2 ]]; then
108108
log "Found $conditions_count node conditions"
109109
break
@@ -145,7 +145,7 @@ test_xid_monitoring_syslog() {
145145
log "========================================="
146146

147147
local gpu_node
148-
gpu_node=$(kubectl get nodes -l workload-type=gpu -o jsonpath='{.items[0].metadata.name}')
148+
gpu_node=$(kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].metadata.name}')
149149

150150
if [[ -z "$gpu_node" ]]; then
151151
error "No GPU nodes found"
@@ -179,7 +179,7 @@ test_sxid_monitoring_syslog() {
179179
log "========================================="
180180

181181
local gpu_node
182-
gpu_node=$(kubectl get nodes -l workload-type=gpu -o jsonpath='{.items[0].metadata.name}')
182+
gpu_node=$(kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].metadata.name}')
183183

184184
if [[ -z "$gpu_node" ]]; then
185185
error "No GPU nodes found"
@@ -254,9 +254,6 @@ test_sxid_monitoring_syslog() {
254254
fi
255255
log "Node event verified: SysLogsSXIDError ✓"
256256

257-
log "Waiting for node to be quarantined and rebooted..."
258-
wait_for_boot_id_change "$gpu_node" "$original_boot_id"
259-
260257
log "Test 3 PASSED ✓"
261258
}
262259

0 commit comments

Comments
 (0)