Skip to content

Commit cdedd0a

Browse files
committed
feat: Implement log collector mock mode and KWOK failure injection for testing
1 parent 3f3c256 commit cdedd0a

File tree

14 files changed

+509
-67
lines changed

14 files changed

+509
-67
lines changed

distros/kubernetes/nvsentinel/charts/fault-remediation/files/log-collector-job.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ spec:
3636
{{- end }}
3737
containers:
3838
- name: log-collector
39-
image: {{ .Values.logCollector.image.repository }}:{{ .Values.global.image.tag | default "latest" }}
39+
image: {{ .Values.logCollector.image.repository }}:{{ .Values.logCollector.image.tag | default .Values.global.image.tag | default "latest" }}
4040
imagePullPolicy: {{ .Values.logCollector.image.pullPolicy }}
4141
securityContext:
4242
privileged: true
@@ -61,6 +61,10 @@ spec:
6161
value: {{ .Values.logCollector.enableGcpSosCollection | quote }}
6262
- name: ENABLE_AWS_SOS_COLLECTION
6363
value: {{ .Values.logCollector.enableAwsSosCollection | quote }}
64+
{{- range $key, $value := .Values.logCollector.env }}
65+
- name: {{ $key }}
66+
value: {{ $value | quote }}
67+
{{- end }}
6468
volumeMounts:
6569
- name: artifacts
6670
mountPath: /artifacts

distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ logCollector:
9696
enabled: false
9797
image:
9898
repository: ghcr.io/nvidia/nvsentinel/log-collector
99+
# tag: latest # Optional: Override global.image.tag for log-collector
99100
pullPolicy: IfNotPresent
100101
# HTTP endpoint where collected logs will be uploaded
101102
uploadURL: "http://nvsentinel-incluster-file-server.nvsentinel.svc.cluster.local/upload"
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
##
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
##
16+
17+
# KWOK Stage to inject failure into log-collector jobs on KWOK nodes
18+
# This stage makes jobs with label 'test-scenario=log-collector-failure' fail
19+
apiVersion: kwok.x-k8s.io/v1alpha1
20+
kind: Stage
21+
metadata:
22+
name: job-log-collector-failure
23+
spec:
24+
resourceRef:
25+
apiGroup: batch
26+
kind: Job
27+
selector:
28+
matchLabels:
29+
test-scenario: log-collector-failure
30+
delay:
31+
durationMilliseconds: 1000
32+
next:
33+
statusTemplate: |
34+
conditions:
35+
- type: Failed
36+
status: "True"
37+
reason: BackoffLimitExceeded
38+
message: "Job has reached the specified backoff limit"
39+
lastProbeTime: {{ now }}
40+
lastTransitionTime: {{ now }}
41+
failed: 1
42+
startTime: {{ now }}
43+
completionTime: {{ now }}
44+

distros/kubernetes/nvsentinel/values-tilt.yaml

100644100755
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,17 @@ fault-quarantine:
121121
fault-remediation:
122122
logLevel: debug
123123

124+
logCollector:
125+
enabled: true
126+
image:
127+
repository: localhost:5001/ghcr.io_nvidia_nvsentinel_log-collector
128+
tag: latest # Use latest tag built by Tilt
129+
pullPolicy: Always # Always pull latest Tilt-built image
130+
env:
131+
MOCK_MODE: "false"
132+
MOCK_EXIT_CODE: "0"
133+
MOCK_SLEEP_DURATION: "2"
134+
124135
affinity:
125136
podAntiAffinity:
126137
requiredDuringSchedulingIgnoredDuringExecution:

fault-remediation/pkg/reconciler/remediation.go

100644100755
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,10 @@ func (c *FaultRemediationClient) RunLogCollectorJob(ctx context.Context, nodeNam
300300
// Set target node
301301
job.Spec.Template.Spec.NodeName = nodeName
302302

303+
// Check for test-scenario annotation on the node and propagate to job and pod labels
304+
log.Printf("[DEBUG] About to check test-scenario annotation for node %s", nodeName)
305+
c.propagateTestScenarioLabel(ctx, nodeName, job)
306+
303307
// Create Job using typed client
304308
created, err := c.kubeClient.BatchV1().Jobs(job.Namespace).Create(ctx, job, metav1.CreateOptions{})
305309
if err != nil {
@@ -415,3 +419,44 @@ func (c *FaultRemediationClient) RunLogCollectorJob(ctx context.Context, nodeNam
415419
return result
416420
}
417421
}
422+
423+
// propagateTestScenarioLabel checks for test-scenario annotation on node and propagates to job/pod labels.
424+
func (c *FaultRemediationClient) propagateTestScenarioLabel(ctx context.Context, nodeName string, job *batchv1.Job) {
425+
log.Printf("[DEBUG] propagateTestScenarioLabel called for node %s", nodeName)
426+
427+
node, err := c.kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
428+
if err != nil {
429+
log.Printf("Warning: failed to get node %s for annotation check: %v", nodeName, err)
430+
return
431+
}
432+
433+
log.Printf("[DEBUG] Node %s annotations: %v", nodeName, node.Annotations)
434+
435+
testScenario, ok := node.Annotations["nvsentinel.nvidia.com/test-scenario"]
436+
if !ok {
437+
log.Printf("[DEBUG] No test-scenario annotation found on node %s", nodeName)
438+
return
439+
}
440+
441+
log.Printf("[DEBUG] Found test-scenario annotation: %s", testScenario)
442+
443+
// Set label on Job metadata
444+
if job.Labels == nil {
445+
job.Labels = make(map[string]string)
446+
}
447+
448+
job.Labels["test-scenario"] = testScenario
449+
450+
// Set label on Pod template metadata (so pods inherit it)
451+
if job.Spec.Template.Labels == nil {
452+
job.Spec.Template.Labels = make(map[string]string)
453+
}
454+
455+
job.Spec.Template.Labels["test-scenario"] = testScenario
456+
457+
log.Printf(
458+
"Applied test-scenario label '%s' to log collector job and pod template for node %s",
459+
testScenario,
460+
nodeName,
461+
)
462+
}

log-collector/Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ RUN useradd -u 10001 -m nvsentinel
4242
WORKDIR /opt/log-collector
4343

4444
COPY log-collector/entrypoint.sh /opt/log-collector/entrypoint.sh
45-
RUN chmod +x /opt/log-collector/entrypoint.sh
45+
COPY log-collector/mock-nvidia-bug-report.sh /opt/log-collector/mock-nvidia-bug-report.sh
46+
COPY log-collector/mock-must-gather.sh /opt/log-collector/mock-must-gather.sh
47+
RUN chmod +x /opt/log-collector/entrypoint.sh /opt/log-collector/mock-nvidia-bug-report.sh /opt/log-collector/mock-must-gather.sh
4648

4749
ENV PATH="/opt/log-collector:${PATH}"
4850

log-collector/Tiltfile

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,13 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
docker_build(
16-
"ghcr.io/nvidia/nvsentinel/log-collector",
17-
context=".",
18-
dockerfile="./Dockerfile"
15+
# Build docker image and push to local registry with fixed tag
16+
# Using local_resource since the image is used in dynamically created jobs
17+
# Build from repo root since Dockerfile expects that context
18+
local_resource(
19+
"log-collector",
20+
"cd .. && docker build -t localhost:5001/ghcr.io_nvidia_nvsentinel_log-collector:latest -f log-collector/Dockerfile . && docker push localhost:5001/ghcr.io_nvidia_nvsentinel_log-collector:latest",
21+
deps=["./"],
22+
ignore=["./Tiltfile"],
23+
labels=["images"]
1924
)

log-collector/entrypoint.sh

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,28 @@ MUST_GATHER_SCRIPT_URL="${MUST_GATHER_SCRIPT_URL:-https://raw.githubusercontent.
3030
ENABLE_GCP_SOS_COLLECTION="${ENABLE_GCP_SOS_COLLECTION:-false}"
3131
ENABLE_AWS_SOS_COLLECTION="${ENABLE_AWS_SOS_COLLECTION:-false}"
3232

33+
# Mock mode for testing - prepends mock scripts to PATH
34+
MOCK_MODE="${MOCK_MODE:-false}"
35+
MOCK_EXIT_CODE="${MOCK_EXIT_CODE:-0}"
36+
37+
if [ "${MOCK_MODE}" = "true" ]; then
38+
echo "[MOCK] Enabling mock mode - using mock nvidia-bug-report.sh and must-gather.sh"
39+
MOCK_SCRIPTS_DIR="/opt/log-collector"
40+
41+
# Copy mock scripts and make them executable
42+
cp "${MOCK_SCRIPTS_DIR}/mock-nvidia-bug-report.sh" "${MOCK_SCRIPTS_DIR}/nvidia-bug-report.sh"
43+
cp "${MOCK_SCRIPTS_DIR}/mock-must-gather.sh" "${MOCK_SCRIPTS_DIR}/must-gather.sh"
44+
chmod +x "${MOCK_SCRIPTS_DIR}/nvidia-bug-report.sh" "${MOCK_SCRIPTS_DIR}/must-gather.sh"
45+
46+
# Prepend to PATH so mocks are used instead of real tools
47+
export PATH="${MOCK_SCRIPTS_DIR}:${PATH}"
48+
49+
# Override MUST_GATHER_SCRIPT_URL to use local mock instead of downloading
50+
MUST_GATHER_SCRIPT_URL="file://${MOCK_SCRIPTS_DIR}/must-gather.sh"
51+
52+
echo "[MOCK] Mock mode enabled. nvidia-bug-report.sh and must-gather.sh will use mock versions."
53+
fi
54+
3355
mkdir -p "${ARTIFACTS_DIR}"
3456
echo "[INFO] Target node: ${NODE_NAME} | GPU Operator namespace: ${GPU_OPERATOR_NAMESPACE} | Driver container: ${DRIVER_CONTAINER_NAME}"
3557

@@ -77,8 +99,16 @@ AWS_SOS_REPORT=""
7799
GCP_NVIDIA_BUG_REPORT="/host/home/kubernetes/bin/nvidia/bin/nvidia-bug-report.sh"
78100

79101
# 1) Collect nvidia-bug-report - auto-detect approach
102+
# In mock mode, use the local mock script directly
103+
if [ "${MOCK_MODE}" = "true" ]; then
104+
echo "[MOCK] Using local mock nvidia-bug-report.sh"
105+
BUG_REPORT_LOCAL_BASE="${ARTIFACTS_DIR}/nvidia-bug-report-${NODE_NAME}-${TIMESTAMP}"
106+
BUG_REPORT_LOCAL="${BUG_REPORT_LOCAL_BASE}.log.gz"
107+
nvidia-bug-report.sh --output-file "${BUG_REPORT_LOCAL_BASE}.log"
108+
echo "[MOCK] Bug report saved to ${BUG_REPORT_LOCAL}"
109+
80110
# Check if GCP COS nvidia-bug-report exists on the host filesystem (accessed via privileged container)
81-
if [ -f "${GCP_NVIDIA_BUG_REPORT}" ]; then
111+
elif [ -f "${GCP_NVIDIA_BUG_REPORT}" ]; then
82112
echo "[INFO] Found nvidia-bug-report at GCP COS location: ${GCP_NVIDIA_BUG_REPORT}"
83113

84114
# Use GCP COS approach - write directly to container filesystem

log-collector/mock-must-gather.sh

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Test stand-in for the GPU Operator must-gather script.
# Creates ./namespaces, ./logs and ./cluster-info under the current working
# directory and fills them with small placeholder files. Reads NODE_NAME from
# the environment (falls back to "unknown").

set -e

printf '%s\n' "[MOCK] GPU Operator must-gather called"
printf '%s\n' "[MOCK] Collecting mock diagnostic data..."

# Lay out the directory tree the mock populates
mkdir -p ./namespaces ./logs ./cluster-info

# Summary file describing this mock run
{
    printf 'Mock GPU Operator Must-Gather\n'
    printf 'Generated: %s\n' "$(date)"
    printf 'Node: %s\n' "${NODE_NAME:-unknown}"
    printf 'Mock Mode: Enabled\n'
} > ./cluster-info/info.txt

# Placeholder pod log and namespace dump
printf '%s\n' "Mock pod logs - $(date)" > ./logs/mock-pod.log
printf '%s\n' "Mock namespace data" > ./namespaces/mock-ns.yaml

printf '%s\n' "[MOCK] Must-gather complete"
exit 0
26+
log-collector/mock-nvidia-bug-report.sh

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/bin/bash
# Mock nvidia-bug-report.sh for testing
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Usage: nvidia-bug-report.sh --output-file <path>
#
# Writes a one-line, gzip-compressed placeholder report to "<path>.gz".
# Any other arguments are accepted and ignored. Reads NODE_NAME from the
# environment (falls back to "unknown").
#
# Exit status: 0 on success, 1 when --output-file is missing or has no value.

set -e

echo "[MOCK] nvidia-bug-report.sh called"

# Parse arguments to find output file
OUTPUT_FILE=""
while [[ $# -gt 0 ]]; do
    case "$1" in
        --output-file)
            # Without this check a trailing "--output-file" makes `shift 2`
            # fail, and `set -e` then ends the script with no message.
            if [[ $# -lt 2 ]]; then
                echo "[MOCK ERROR] --output-file requires a value" >&2
                exit 1
            fi
            OUTPUT_FILE="$2"
            shift 2
            ;;
        *)
            shift
            ;;
    esac
done

if [ -z "$OUTPUT_FILE" ]; then
    echo "[MOCK ERROR] No output file specified" >&2
    exit 1
fi

echo "[MOCK] Generating mock nvidia-bug-report to: ${OUTPUT_FILE}"

# Create a simple mock bug report and gzip it
# Real nvidia-bug-report.sh automatically appends .gz to the output file
# We mimic this behavior
echo "Mock NVIDIA Bug Report - Node: ${NODE_NAME:-unknown}, Date: $(date)" | gzip > "${OUTPUT_FILE}.gz"

echo "[MOCK] Mock nvidia-bug-report created successfully at ${OUTPUT_FILE}.gz"
exit 0
37+

0 commit comments

Comments
 (0)