Skip to content

Commit 2227a29

Browse files
committed
feat: Add mock mode for log collector with Tilt tests for HIPPO-2120
1 parent 6e84d31 commit 2227a29

File tree

12 files changed

+650
-4
lines changed

12 files changed

+650
-4
lines changed

distros/kubernetes/nvsentinel/charts/fault-remediation/files/log-collector-job.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ spec:
6161
value: {{ .Values.logCollector.enableGcpSosCollection | quote }}
6262
- name: ENABLE_AWS_SOS_COLLECTION
6363
value: {{ .Values.logCollector.enableAwsSosCollection | quote }}
64+
- name: MOCK_MODE
65+
value: {{ .Values.logCollector.mockMode | quote }}
66+
- name: MOCK_EXIT_CODE
67+
value: {{ .Values.logCollector.mockExitCode | quote }}
68+
- name: MOCK_SLEEP_DURATION
69+
value: {{ .Values.logCollector.mockSleepDuration | quote }}
6470
volumeMounts:
6571
- name: artifacts
6672
mountPath: /artifacts

distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,10 @@ logCollector:
105105
enableGcpSosCollection: false
106106
# Enable AWS-specific SOS report collection
107107
enableAwsSosCollection: false
108+
# Mock mode for testing
109+
# Values: true (enable mock mode), false (disable mock mode); the job template quotes the value, so the container sees MOCK_MODE="true" or "false"
110+
mockMode: false
111+
# Exit code to use in mock mode (0 for success, non-zero for failure scenarios)
112+
mockExitCode: 0
113+
# Sleep duration in seconds during mock mode to simulate collection time
114+
mockSleepDuration: 5

distros/kubernetes/nvsentinel/values-tilt.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,12 @@ fault-quarantine:
121121
fault-remediation:
122122
logLevel: debug
123123

124+
logCollector:
125+
enabled: true
126+
mockMode: "true"
127+
mockExitCode: "0"
128+
mockSleepDuration: "5"
129+
124130
affinity:
125131
podAntiAffinity:
126132
requiredDuringSchedulingIgnoredDuringExecution:

fault-remediation/pkg/reconciler/remediation.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ func (c *FaultRemediationClient) RunLogCollectorJob(ctx context.Context, nodeNam
311311
log.Printf("Waiting for log collector job %s to complete", created.Name)
312312

313313
// Use a context with timeout for the watch
314-
watchCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
314+
watchCtx, cancel := context.WithTimeout(ctx, 10*time.Minute)
315315
defer cancel()
316316

317317
// Use SharedInformerFactory for efficient job status monitoring with filtering

log-collector/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ WORKDIR /opt/log-collector
4444
COPY log-collector/entrypoint.sh /opt/log-collector/entrypoint.sh
4545
RUN chmod +x /opt/log-collector/entrypoint.sh
4646

47+
# Copy mock scripts for testing
48+
COPY log-collector/mock-nvidia-bug-report.sh /mock-nvidia-bug-report.sh
49+
COPY log-collector/mock-must-gather.sh /mock-must-gather.sh
50+
RUN chmod +x /mock-nvidia-bug-report.sh /mock-must-gather.sh
51+
4752
ENV PATH="/opt/log-collector:${PATH}"
4853

4954
ENTRYPOINT ["/opt/log-collector/entrypoint.sh"]

log-collector/entrypoint.sh

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,32 @@ MUST_GATHER_SCRIPT_URL="${MUST_GATHER_SCRIPT_URL:-https://raw.githubusercontent.
3030
ENABLE_GCP_SOS_COLLECTION="${ENABLE_GCP_SOS_COLLECTION:-false}"
3131
ENABLE_AWS_SOS_COLLECTION="${ENABLE_AWS_SOS_COLLECTION:-false}"
3232

33+
# Mock mode for testing
#   MOCK_MODE            "true" runs the normal collection flow against the mock tools
#   MOCK_EXIT_CODE       exit status this entrypoint returns at the end of a mock run
#   MOCK_SLEEP_DURATION  seconds the mock tools sleep to simulate collection time
MOCK_MODE="${MOCK_MODE:-false}"
MOCK_EXIT_CODE="${MOCK_EXIT_CODE:-0}"
MOCK_SLEEP_DURATION="${MOCK_SLEEP_DURATION:-5}"

# If mock mode is enabled, set up mock tools directory and prepend to PATH
# This approach allows the real production flow to run while using fake tools
if [ "${MOCK_MODE}" = "true" ]; then
    echo "[INFO] Mock mode enabled - setting up mock tools"

    # The mock must-gather script is launched with `bash` as a child process and
    # reads MOCK_SLEEP_DURATION from its environment. Export it so the default
    # assigned above reaches the mocks even when the pod spec did not set it.
    export MOCK_SLEEP_DURATION

    MOCK_TOOLS_DIR="/tmp/mock-tools"
    mkdir -p "${MOCK_TOOLS_DIR}"

    # Symlink mock nvidia-bug-report.sh
    # No chmod on the link: chmod follows symlinks and would try to change the
    # image file itself, which the image build already marks executable and
    # which a non-owner user is not allowed to modify.
    if [ -f "/mock-nvidia-bug-report.sh" ]; then
        ln -sf /mock-nvidia-bug-report.sh "${MOCK_TOOLS_DIR}/nvidia-bug-report.sh"
    else
        # Surface a broken image instead of silently falling back to the real tool.
        echo "[WARN] /mock-nvidia-bug-report.sh not found - nvidia-bug-report.sh will not be mocked" >&2
    fi

    # Prepend mock tools directory to PATH so our mocks are used first
    export PATH="${MOCK_TOOLS_DIR}:${PATH}"

    echo "[INFO] Mock tools setup complete. PATH: ${PATH}"
    echo "[INFO] Mock exit code will be: ${MOCK_EXIT_CODE}"
fi
58+
3359
mkdir -p "${ARTIFACTS_DIR}"
3460
echo "[INFO] Target node: ${NODE_NAME} | GPU Operator namespace: ${GPU_OPERATOR_NAMESPACE} | Driver container: ${DRIVER_CONTAINER_NAME}"
3561

@@ -197,8 +223,17 @@ fi
197223
GPU_MG_DIR="${ARTIFACTS_DIR}/gpu-operator-must-gather"
198224
mkdir -p "${GPU_MG_DIR}"
199225
echo "[INFO] Running GPU Operator must-gather..."
200-
curl -fsSL "${MUST_GATHER_SCRIPT_URL}" -o "${GPU_MG_DIR}/must-gather.sh"
201-
chmod +x "${GPU_MG_DIR}/must-gather.sh"
226+
227+
# In mock mode, use the mock script directly; otherwise download from URL
228+
if [ "${MOCK_MODE}" = "true" ] && [ -f "/mock-must-gather.sh" ]; then
229+
cp /mock-must-gather.sh "${GPU_MG_DIR}/must-gather.sh"
230+
chmod +x "${GPU_MG_DIR}/must-gather.sh"
231+
echo "[INFO] Using mock must-gather script"
232+
else
233+
curl -fsSL "${MUST_GATHER_SCRIPT_URL}" -o "${GPU_MG_DIR}/must-gather.sh"
234+
chmod +x "${GPU_MG_DIR}/must-gather.sh"
235+
fi
236+
202237
bash "${GPU_MG_DIR}/must-gather.sh"
203238

204239
GPU_MG_TARBALL="${ARTIFACTS_DIR}/gpu-operator-must-gather-${NODE_NAME}-${TIMESTAMP}.tar.gz"
@@ -250,4 +285,10 @@ if [ -n "${UPLOAD_URL_BASE:-}" ]; then
250285
fi
251286
fi
252287

253-
echo "[INFO] Done. Artifacts under ${ARTIFACTS_DIR}"
288+
echo "[INFO] Done. Artifacts under ${ARTIFACTS_DIR}"
289+
290+
# If mock mode is enabled, exit with the configured mock exit code
291+
if [ "${MOCK_MODE}" = "true" ]; then
292+
echo "[INFO] Mock mode: Exiting with code ${MOCK_EXIT_CODE}"
293+
exit "${MOCK_EXIT_CODE}"
294+
fi

log-collector/mock-must-gather.sh

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/bin/bash
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Mock must-gather.sh script for testing
# This script simulates the behavior of GPU Operator must-gather without requiring actual GPU Operator
#
# Usage:
#   must-gather.sh [-o|--output DIR]
#
# Writes must-gather.tar.gz into DIR (default: the current working directory).
# Every other argument is accepted and ignored.
#
# Environment:
#   MOCK_SLEEP_DURATION     whole seconds to sleep before producing output;
#                           skipped when unset, non-numeric or 0
#   NODE_NAME               node name written into the report (default: unknown)
#   TIMESTAMP               timestamp written into the report (default: epoch seconds)
#   GPU_OPERATOR_NAMESPACE  namespace written into the report (default: gpu-operator)

set -e

OUTPUT_DIR="${PWD}"
OUTPUT_FILE="must-gather.tar.gz"

# Parse command-line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        -o|--output)
            # Fail with a message instead of dying silently on `shift 2`.
            if [[ $# -lt 2 ]]; then
                echo "[MOCK] ERROR: $1 requires a directory argument" >&2
                exit 1
            fi
            OUTPUT_DIR="$2"
            shift 2
            ;;
        *)
            shift
            ;;
    esac
done

# Create the destination if needed and resolve it to an absolute path so the
# tarball location does not depend on any later change of working directory.
mkdir -p "${OUTPUT_DIR}"
OUTPUT_DIR="$(cd "${OUTPUT_DIR}" && pwd)"

echo "[MOCK] Generating GPU Operator must-gather"

# Simulate collection time
# The regex guard keeps `[ ... -gt 0 ]` from printing an error on non-integer input.
if [[ "${MOCK_SLEEP_DURATION:-}" =~ ^[0-9]+$ ]] && [ "${MOCK_SLEEP_DURATION}" -gt 0 ]; then
    sleep "${MOCK_SLEEP_DURATION}"
fi

# Create mock must-gather content
TEMP_DIR=$(mktemp -d)
# Remove the scratch directory on every exit path, including `set -e` aborts.
trap 'rm -rf "${TEMP_DIR}"' EXIT
mkdir -p "${TEMP_DIR}/gpu-operator-must-gather"

cat > "${TEMP_DIR}/gpu-operator-must-gather/summary.txt" <<EOF
Mock GPU Operator Must-Gather Report
=====================================
Generated: $(date)
Node: ${NODE_NAME:-unknown}
Timestamp: ${TIMESTAMP:-$(date +%s)}
Namespace: ${GPU_OPERATOR_NAMESPACE:-gpu-operator}

GPU Operator Pods:
------------------
- nvidia-driver-daemonset-xxxxx (Mock)
- nvidia-device-plugin-daemonset-xxxxx (Mock)
- gpu-feature-discovery-xxxxx (Mock)

GPU Operator Version: v23.9.0 (Mock)

Mock must-gather completed successfully
EOF

# Create mock pod logs
mkdir -p "${TEMP_DIR}/gpu-operator-must-gather/logs"
echo "Mock driver pod logs - Node: ${NODE_NAME:-unknown}" > "${TEMP_DIR}/gpu-operator-must-gather/logs/nvidia-driver.log"
echo "Mock device plugin logs - Node: ${NODE_NAME:-unknown}" > "${TEMP_DIR}/gpu-operator-must-gather/logs/device-plugin.log"

# Create tarball
# -C archives relative to the scratch directory without changing this script's
# working directory; the archive is written straight to its final location.
tar czf "${OUTPUT_DIR}/${OUTPUT_FILE}" -C "${TEMP_DIR}" gpu-operator-must-gather/

echo "[MOCK] GPU Operator must-gather completed: ${OUTPUT_DIR}/${OUTPUT_FILE}"
exit 0
85+
log-collector/mock-nvidia-bug-report.sh

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/bin/bash
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Mock nvidia-bug-report.sh script for testing
# This script simulates the behavior of nvidia-bug-report.sh without requiring actual NVIDIA drivers
#
# Usage:
#   nvidia-bug-report.sh [--output-file PATH | --output-file=PATH]
#
# Writes a gzip-compressed mock report to PATH (default: nvidia-bug-report.log.gz
# in the current working directory). Every other argument is accepted and ignored.
#
# Environment:
#   MOCK_SLEEP_DURATION  whole seconds to sleep before producing output;
#                        skipped when unset, non-numeric or 0
#   NODE_NAME            node name written into the report
#   TIMESTAMP            timestamp written into the report (default: epoch seconds)

set -e

OUTPUT_FILE="nvidia-bug-report.log.gz"

# Parse command-line arguments to find output file
while [[ $# -gt 0 ]]; do
    case $1 in
        --output-file)
            # Fail with a message instead of dying silently on `shift 2`.
            if [[ $# -lt 2 ]]; then
                echo "[MOCK] ERROR: --output-file requires a path argument" >&2
                exit 1
            fi
            OUTPUT_FILE="$2"
            shift 2
            ;;
        --output-file=*)
            OUTPUT_FILE="${1#*=}"
            shift
            ;;
        *)
            shift
            ;;
    esac
done

echo "[MOCK] Generating nvidia-bug-report at ${OUTPUT_FILE}"

# Simulate collection time
# The regex guard keeps `[ ... -gt 0 ]` from printing an error on non-integer input.
if [[ "${MOCK_SLEEP_DURATION:-}" =~ ^[0-9]+$ ]] && [ "${MOCK_SLEEP_DURATION}" -gt 0 ]; then
    sleep "${MOCK_SLEEP_DURATION}"
fi

# Create mock content with realistic structure and compress the mock report
# The report is streamed straight into gzip, so there is no shared scratch file
# under /tmp to collide between concurrent runs or to be left behind on failure.
gzip -c > "${OUTPUT_FILE}" <<EOF
Mock NVIDIA Bug Report
============================
Generated: $(date)
Node: ${NODE_NAME:-unknown}
Timestamp: ${TIMESTAMP:-$(date +%s)}

System Information:
-------------------
Hostname: ${NODE_NAME:-mock-node}
Kernel: $(uname -r)
OS: Mock Linux Distribution

NVIDIA Driver Information:
--------------------------
Driver Version: 550.54.15 (Mock)
CUDA Version: 12.4 (Mock)

GPU Information:
----------------
GPU 0: NVIDIA A100-SXM4-80GB (Mock)
UUID: GPU-12345678-1234-1234-1234-123456789012
Bus ID: 0000:00:04.0

Mock nvidia-bug-report generated successfully
EOF

echo "[MOCK] nvidia-bug-report completed successfully"
exit 0
81+

tests/helpers/kube.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,6 +1398,24 @@ func PortForwardPod(
13981398
return stopChan, readyChan
13991399
}
14001400

1401+
// GetFileServerPodName returns the name of the file-server pod in the NVSentinel namespace
1402+
func GetFileServerPodName(ctx context.Context, client klient.Client) (string, error) {
1403+
podList := &v1.PodList{}
1404+
1405+
err := client.Resources(NVSentinelNamespace).List(ctx, podList)
1406+
if err != nil {
1407+
return "", fmt.Errorf("failed to list pods: %w", err)
1408+
}
1409+
1410+
for _, pod := range podList.Items {
1411+
if strings.Contains(pod.Name, "file-server") && IsPodReady(pod) {
1412+
return pod.Name, nil
1413+
}
1414+
}
1415+
1416+
return "", fmt.Errorf("file-server pod not found")
1417+
}
1418+
14011419
// WaitForNodeConditionWithCheckName waits for the node to have a condition with the reason as checkName.
14021420
func WaitForNodeConditionWithCheckName(
14031421
ctx context.Context, t *testing.T, c klient.Client, nodeName, checkName, message string,

0 commit comments

Comments
 (0)