Skip to content

Commit 8330214

Browse files
committed
feat: Add mock mode for log collector with Tilt tests for HIPPO-2120
1 parent 727c185 commit 8330214

File tree

12 files changed

+715
-1
lines changed

12 files changed

+715
-1
lines changed

distros/kubernetes/nvsentinel/charts/fault-remediation/files/log-collector-job.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ spec:
6161
              value: {{ .Values.logCollector.enableGcpSosCollection | quote }}
            - name: ENABLE_AWS_SOS_COLLECTION
              value: {{ .Values.logCollector.enableAwsSosCollection | quote }}
            # Mock-mode settings read by entrypoint.sh (test scenarios only).
            # All values are passed through `quote`, so booleans/ints in
            # values.yaml and quoted strings in overrides render identically.
            - name: MOCK_MODE
              value: {{ .Values.logCollector.mockMode | quote }}
            - name: MOCK_EXIT_CODE
              value: {{ .Values.logCollector.mockExitCode | quote }}
            - name: MOCK_SLEEP_DURATION
              value: {{ .Values.logCollector.mockSleepDuration | quote }}
          volumeMounts:
            - name: artifacts
              mountPath: /artifacts

distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,10 @@ logCollector:
105105
  enableGcpSosCollection: false
  # Enable AWS-specific SOS report collection
  enableAwsSosCollection: false
  # Mock mode for testing: the log collector emits synthetic artifacts and
  # skips real collection when this is true.
  # Values: "true" (enable mock mode), "false" (disable mock mode)
  mockMode: false
  # Exit code to use in mock mode (0 for success, non-zero for failure scenarios)
  mockExitCode: 0
  # Sleep duration in seconds during mock mode to simulate collection time
  mockSleepDuration: 5

distros/kubernetes/nvsentinel/values-tilt.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,12 @@ fault-quarantine:
121121
fault-remediation:
  logLevel: debug

  # Log-collector mock mode for Tilt test environments.
  # Quoted strings are intentional: the chart templates each value through
  # `| quote`, so "true"/0/"5" render the same as bare booleans/ints.
  logCollector:
    enabled: true
    mockMode: "true"
    mockExitCode: "0"
    mockSleepDuration: "5"

  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:

fault-remediation/pkg/reconciler/remediation.go

100644100755
Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,38 @@ func (c *FaultRemediationClient) handleCreateCRError(
270270
return false, ""
271271
}
272272

273+
// updateJobEnvVar updates an environment variable in the job spec.
274+
func updateJobEnvVar(job *batchv1.Job, envName, envValue string) {
275+
for i := range job.Spec.Template.Spec.Containers {
276+
for j := range job.Spec.Template.Spec.Containers[i].Env {
277+
if job.Spec.Template.Spec.Containers[i].Env[j].Name == envName {
278+
job.Spec.Template.Spec.Containers[i].Env[j].Value = envValue
279+
return
280+
}
281+
}
282+
}
283+
}
284+
285+
// applyNodeAnnotationsToJob applies test overrides from node annotations to the job spec.
286+
func (c *FaultRemediationClient) applyNodeAnnotationsToJob(ctx context.Context, job *batchv1.Job, nodeName string) {
287+
node, err := c.kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
288+
if err != nil || node.Annotations == nil {
289+
return
290+
}
291+
292+
// Override MOCK_EXIT_CODE from annotation (for test scenarios)
293+
if exitCodeStr, ok := node.Annotations["nvsentinel.nvidia.com/log-collector-mock-exit-code"]; ok {
294+
log.Printf("Overriding log collector mock exit code from node annotation: %s", exitCodeStr)
295+
updateJobEnvVar(job, "MOCK_EXIT_CODE", exitCodeStr)
296+
}
297+
298+
// Override MOCK_SLEEP_DURATION from annotation (for test scenarios)
299+
if sleepDurationStr, ok := node.Annotations["nvsentinel.nvidia.com/log-collector-mock-sleep"]; ok {
300+
log.Printf("Overriding log collector mock sleep duration from node annotation: %s", sleepDurationStr)
301+
updateJobEnvVar(job, "MOCK_SLEEP_DURATION", sleepDurationStr)
302+
}
303+
}
304+
273305
// RunLogCollectorJob creates a log collector Job and waits for completion.
274306
// nolint: cyclop // todo
275307
func (c *FaultRemediationClient) RunLogCollectorJob(ctx context.Context, nodeName string) error {
@@ -297,6 +329,9 @@ func (c *FaultRemediationClient) RunLogCollectorJob(ctx context.Context, nodeNam
297329
return fmt.Errorf("failed to unmarshal Job manifest: %w", err)
298330
}
299331

332+
// Apply test overrides from node annotations (for test scenarios)
333+
c.applyNodeAnnotationsToJob(ctx, job, nodeName)
334+
300335
// Set target node
301336
job.Spec.Template.Spec.NodeName = nodeName
302337

@@ -311,7 +346,7 @@ func (c *FaultRemediationClient) RunLogCollectorJob(ctx context.Context, nodeNam
311346
log.Printf("Waiting for log collector job %s to complete", created.Name)
312347

313348
// Use a context with timeout for the watch
314-
watchCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
349+
watchCtx, cancel := context.WithTimeout(ctx, 10*time.Minute)
315350
defer cancel()
316351

317352
// Use SharedInformerFactory for efficient job status monitoring with filtering

log-collector/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ WORKDIR /opt/log-collector
4444
COPY log-collector/entrypoint.sh /opt/log-collector/entrypoint.sh
RUN chmod +x /opt/log-collector/entrypoint.sh

# Copy mock scripts for testing; entrypoint.sh invokes these instead of the
# real collectors when MOCK_MODE=true. Placed at / so the entrypoint's
# fixed-path existence checks (/mock-*.sh) find them.
COPY log-collector/mock-nvidia-bug-report.sh /mock-nvidia-bug-report.sh
COPY log-collector/mock-must-gather.sh /mock-must-gather.sh
RUN chmod +x /mock-nvidia-bug-report.sh /mock-must-gather.sh

ENV PATH="/opt/log-collector:${PATH}"

ENTRYPOINT ["/opt/log-collector/entrypoint.sh"]

log-collector/entrypoint.sh

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,81 @@ MUST_GATHER_SCRIPT_URL="${MUST_GATHER_SCRIPT_URL:-https://raw.githubusercontent.
3030
ENABLE_GCP_SOS_COLLECTION="${ENABLE_GCP_SOS_COLLECTION:-false}"
3131
ENABLE_AWS_SOS_COLLECTION="${ENABLE_AWS_SOS_COLLECTION:-false}"
3232

33+
# Mock mode for testing.
# When MOCK_MODE=true the script emits synthetic artifacts and exits with
# MOCK_EXIT_CODE instead of performing real collection.
MOCK_MODE="${MOCK_MODE:-false}"
MOCK_EXIT_CODE="${MOCK_EXIT_CODE:-0}"
MOCK_SLEEP_DURATION="${MOCK_SLEEP_DURATION:-5}"

mkdir -p "${ARTIFACTS_DIR}"
echo "[INFO] Target node: ${NODE_NAME} | GPU Operator namespace: ${GPU_OPERATOR_NAMESPACE} | Driver container: ${DRIVER_CONTAINER_NAME}"

# Early exit for mock mode - generate synthetic artifacts and skip real collection
if [ "${MOCK_MODE}" = "true" ]; then
    echo "[INFO] Mock mode enabled - generating synthetic artifacts"

    # Generate mock nvidia-bug-report artifact
    BUG_REPORT_LOCAL="${ARTIFACTS_DIR}/nvidia-bug-report-${NODE_NAME}-${TIMESTAMP}.log.gz"
    if [ -f "/mock-nvidia-bug-report.sh" ]; then
        echo "[INFO] Running mock nvidia-bug-report.sh"
        # BUGFIX: write to BUG_REPORT_LOCAL (.log.gz). The previous ".log"
        # output path was never referenced again, so the existence check and
        # upload below always skipped the bug report when the mock script
        # was present.
        # MOCK_SLEEP_DURATION=0 stops the sub-script from sleeping on its
        # own; this script performs the single simulated delay further down
        # (previously the delay was applied two to three times).
        MOCK_SLEEP_DURATION=0 /mock-nvidia-bug-report.sh --output-file "${BUG_REPORT_LOCAL}"
    else
        echo "[INFO] Creating synthetic nvidia-bug-report"
        echo "Mock nvidia-bug-report output for ${NODE_NAME} at ${TIMESTAMP}" | gzip > "${BUG_REPORT_LOCAL}"
    fi
    echo "[INFO] Mock bug report created: ${BUG_REPORT_LOCAL}"

    # Generate mock GPU Operator must-gather artifact
    GPU_MG_DIR="${ARTIFACTS_DIR}/gpu-operator-must-gather"
    mkdir -p "${GPU_MG_DIR}"
    GPU_MG_TARBALL="${ARTIFACTS_DIR}/gpu-operator-must-gather-${NODE_NAME}-${TIMESTAMP}.tar.gz"

    if [ -f "/mock-must-gather.sh" ]; then
        echo "[INFO] Running mock must-gather.sh"
        cp /mock-must-gather.sh "${GPU_MG_DIR}/must-gather.sh"
        chmod +x "${GPU_MG_DIR}/must-gather.sh"
        # NOTE(review): the mock script writes its must-gather.tar.gz to its
        # CWD, not GPU_MG_DIR, so the tarball built below contains the script
        # copy rather than that output. This mirrors the real must-gather
        # invocation later in this file — confirm whether the nested output
        # should be captured instead.
        MOCK_SLEEP_DURATION=0 bash "${GPU_MG_DIR}/must-gather.sh" || true
    else
        echo "[INFO] Creating synthetic must-gather data"
        echo "Mock must-gather output for ${NODE_NAME} at ${TIMESTAMP}" > "${GPU_MG_DIR}/mock-data.txt"
    fi

    tar -C "${GPU_MG_DIR}" -czf "${GPU_MG_TARBALL}" .
    echo "[INFO] Mock must-gather tarball created: ${GPU_MG_TARBALL}"

    # Optional sleep to simulate collection time
    if [ "${MOCK_SLEEP_DURATION}" -gt 0 ]; then
        echo "[INFO] Sleeping ${MOCK_SLEEP_DURATION}s to simulate collection"
        sleep "${MOCK_SLEEP_DURATION}"
    fi

    # Upload mock artifacts if upload URL is configured
    if [ -n "${UPLOAD_URL_BASE:-}" ]; then
        echo "[INFO] Uploading mock artifacts to ${UPLOAD_URL_BASE}/${NODE_NAME}/${TIMESTAMP}"

        if [ -f "${BUG_REPORT_LOCAL}" ]; then
            if curl -fsS -X PUT --upload-file "${BUG_REPORT_LOCAL}" \
                "${UPLOAD_URL_BASE}/${NODE_NAME}/${TIMESTAMP}/$(basename "${BUG_REPORT_LOCAL}")"; then
                echo "[UPLOAD_SUCCESS] mock nvidia-bug-report uploaded: $(basename "${BUG_REPORT_LOCAL}")"
            else
                echo "[UPLOAD_FAILED] Failed to upload mock nvidia-bug-report: $(basename "${BUG_REPORT_LOCAL}")" >&2
            fi
        fi

        if [ -f "${GPU_MG_TARBALL}" ]; then
            if curl -fsS -X PUT --upload-file "${GPU_MG_TARBALL}" \
                "${UPLOAD_URL_BASE}/${NODE_NAME}/${TIMESTAMP}/$(basename "${GPU_MG_TARBALL}")"; then
                echo "[UPLOAD_SUCCESS] mock gpu-operator must-gather uploaded: $(basename "${GPU_MG_TARBALL}")"
            else
                echo "[UPLOAD_FAILED] Failed to upload mock gpu-operator must-gather: $(basename "${GPU_MG_TARBALL}")" >&2
            fi
        fi
    fi

    echo "[INFO] Mock mode complete. Artifacts under ${ARTIFACTS_DIR}"
    echo "[INFO] Mock mode: Exiting with code ${MOCK_EXIT_CODE}"
    exit "${MOCK_EXIT_CODE}"
fi
107+
36108
# Function to detect if running on GCP using IMDS
37109
is_running_on_gcp() {
38110
local timeout=5
@@ -197,8 +269,10 @@ fi
197269
GPU_MG_DIR="${ARTIFACTS_DIR}/gpu-operator-must-gather"
198270
mkdir -p "${GPU_MG_DIR}"
199271
echo "[INFO] Running GPU Operator must-gather..."
272+
200273
curl -fsSL "${MUST_GATHER_SCRIPT_URL}" -o "${GPU_MG_DIR}/must-gather.sh"
201274
chmod +x "${GPU_MG_DIR}/must-gather.sh"
275+
202276
bash "${GPU_MG_DIR}/must-gather.sh"
203277

204278
GPU_MG_TARBALL="${ARTIFACTS_DIR}/gpu-operator-must-gather-${NODE_NAME}-${TIMESTAMP}.tar.gz"

log-collector/mock-must-gather.sh

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/bin/bash
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Mock must-gather.sh script for testing
# This script simulates the behavior of GPU Operator must-gather without requiring actual GPU Operator

set -e

OUTPUT_DIR="${PWD}"
OUTPUT_FILE="must-gather.tar.gz"

# Parse command-line arguments (-o/--output selects the destination directory;
# anything else is ignored for compatibility with the real script's flags).
while [[ $# -gt 0 ]]; do
    case $1 in
        -o|--output)
            OUTPUT_DIR="$2"
            shift 2
            ;;
        *)
            shift
            ;;
    esac
done

echo "[MOCK] Generating GPU Operator must-gather"

# Simulate collection time
if [ -n "${MOCK_SLEEP_DURATION:-}" ] && [ "${MOCK_SLEEP_DURATION}" -gt 0 ]; then
    sleep "${MOCK_SLEEP_DURATION}"
fi

# Create mock must-gather content in a private temp dir. The trap removes it
# on any exit path — previously a failure after mktemp (under set -e) leaked
# the directory.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf "${TEMP_DIR}"' EXIT
mkdir -p "${TEMP_DIR}/gpu-operator-must-gather"

cat > "${TEMP_DIR}/gpu-operator-must-gather/summary.txt" <<EOF
Mock GPU Operator Must-Gather Report
=====================================
Generated: $(date)
Node: ${NODE_NAME:-unknown}
Timestamp: ${TIMESTAMP:-$(date +%s)}
Namespace: ${GPU_OPERATOR_NAMESPACE:-gpu-operator}

GPU Operator Pods:
------------------
- nvidia-driver-daemonset-xxxxx (Mock)
- nvidia-device-plugin-daemonset-xxxxx (Mock)
- gpu-feature-discovery-xxxxx (Mock)

GPU Operator Version: v23.9.0 (Mock)

Mock must-gather completed successfully
EOF

# Create mock pod logs
mkdir -p "${TEMP_DIR}/gpu-operator-must-gather/logs"
echo "Mock driver pod logs - Node: ${NODE_NAME:-unknown}" > "${TEMP_DIR}/gpu-operator-must-gather/logs/nvidia-driver.log"
echo "Mock device plugin logs - Node: ${NODE_NAME:-unknown}" > "${TEMP_DIR}/gpu-operator-must-gather/logs/device-plugin.log"

# Create the tarball directly in the output directory; tar -C replaces the
# previous cd/tar/mv/cd- sequence with a single step.
tar -C "${TEMP_DIR}" -czf "${OUTPUT_DIR}/${OUTPUT_FILE}" gpu-operator-must-gather/

echo "[MOCK] GPU Operator must-gather completed: ${OUTPUT_DIR}/${OUTPUT_FILE}"
exit 0
85+
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/bin/bash
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Mock nvidia-bug-report.sh script for testing
# This script simulates the behavior of nvidia-bug-report.sh without requiring actual NVIDIA drivers

set -e

OUTPUT_FILE="nvidia-bug-report.log.gz"

# Parse command-line arguments to find output file
while [[ $# -gt 0 ]]; do
    case $1 in
        --output-file)
            OUTPUT_FILE="$2"
            shift 2
            ;;
        --output-file=*)
            OUTPUT_FILE="${1#*=}"
            shift
            ;;
        *)
            shift
            ;;
    esac
done

echo "[MOCK] Generating nvidia-bug-report at ${OUTPUT_FILE}"

# Simulate collection time
if [ -n "${MOCK_SLEEP_DURATION:-}" ] && [ "${MOCK_SLEEP_DURATION}" -gt 0 ]; then
    sleep "${MOCK_SLEEP_DURATION}"
fi

# Use a unique scratch file instead of a fixed /tmp path: the fixed path
# collided under concurrent runs and leaked when a later step failed
# (set -e). The trap cleans it up on every exit path.
SCRATCH_LOG=$(mktemp /tmp/mock-nvidia-bug-report.XXXXXX.log)
trap 'rm -f "${SCRATCH_LOG}"' EXIT

# Create mock content with realistic structure
cat > "${SCRATCH_LOG}" <<EOF
Mock NVIDIA Bug Report
============================
Generated: $(date)
Node: ${NODE_NAME:-unknown}
Timestamp: ${TIMESTAMP:-$(date +%s)}

System Information:
-------------------
Hostname: ${NODE_NAME:-mock-node}
Kernel: $(uname -r)
OS: Mock Linux Distribution

NVIDIA Driver Information:
--------------------------
Driver Version: 550.54.15 (Mock)
CUDA Version: 12.4 (Mock)

GPU Information:
----------------
GPU 0: NVIDIA A100-SXM4-80GB (Mock)
  UUID: GPU-12345678-1234-1234-1234-123456789012
  Bus ID: 0000:00:04.0

Mock nvidia-bug-report generated successfully
EOF

# Compress the mock report
gzip -c "${SCRATCH_LOG}" > "${OUTPUT_FILE}"

echo "[MOCK] nvidia-bug-report completed successfully"
exit 0
81+

tests/helpers/kube.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,6 +1398,24 @@ func PortForwardPod(
13981398
return stopChan, readyChan
13991399
}
14001400

1401+
// GetFileServerPodName returns the name of the file-server pod in the NVSentinel namespace
1402+
func GetFileServerPodName(ctx context.Context, client klient.Client) (string, error) {
1403+
podList := &v1.PodList{}
1404+
1405+
err := client.Resources(NVSentinelNamespace).List(ctx, podList)
1406+
if err != nil {
1407+
return "", fmt.Errorf("failed to list pods: %w", err)
1408+
}
1409+
1410+
for _, pod := range podList.Items {
1411+
if strings.Contains(pod.Name, "file-server") && IsPodReady(pod) {
1412+
return pod.Name, nil
1413+
}
1414+
}
1415+
1416+
return "", fmt.Errorf("file-server pod not found")
1417+
}
1418+
14011419
// WaitForNodeConditionWithCheckName waits for the node to have a condition with the reason as checkName.
14021420
func WaitForNodeConditionWithCheckName(
14031421
ctx context.Context, t *testing.T, c klient.Client, nodeName, checkName, message string,

0 commit comments

Comments
 (0)