Skip to content

Commit bf1c2bd

Browse files
committed
feat: log collector tilt test
1 parent 1584015 commit bf1c2bd

File tree

13 files changed

+574
-122
lines changed

13 files changed

+574
-122
lines changed

distros/kubernetes/nvsentinel/charts/fault-remediation/files/log-collector-job.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ spec:
3636
{{- end }}
3737
containers:
3838
- name: log-collector
39-
image: {{ .Values.logCollector.image.repository }}:{{ .Values.global.image.tag | default "latest" }}
39+
image: {{ .Values.logCollector.image.repository }}:{{ .Values.logCollector.image.tag | default .Values.global.image.tag | default "latest" }}
4040
imagePullPolicy: {{ .Values.logCollector.image.pullPolicy }}
4141
securityContext:
4242
privileged: true

distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ logCollector:
9696
enabled: false
9797
image:
9898
repository: ghcr.io/nvidia/nvsentinel/log-collector
99+
# tag: latest # Optional: Override global.image.tag for log-collector
99100
pullPolicy: IfNotPresent
100101
# HTTP endpoint where collected logs will be uploaded
101102
uploadURL: "http://nvsentinel-incluster-file-server.nvsentinel.svc.cluster.local/upload"

distros/kubernetes/nvsentinel/values-tilt.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,17 @@ fault-quarantine:
124124
fault-remediation:
125125
logLevel: debug
126126

127+
logCollector:
128+
enabled: true
129+
image:
130+
repository: localhost:5001/ghcr.io_nvidia_nvsentinel_log-collector
131+
tag: latest # Use latest tag built by Tilt
132+
pullPolicy: Always # Always pull latest Tilt-built image
133+
env:
134+
MOCK_MODE: "true"
135+
MOCK_EXIT_CODE: "0"
136+
MOCK_SLEEP_DURATION: "2"
137+
127138
affinity:
128139
podAntiAffinity:
129140
requiredDuringSchedulingIgnoredDuringExecution:

log-collector/Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,10 @@ RUN useradd -u 10001 -m nvsentinel
4242
WORKDIR /opt/log-collector
4343

4444
COPY log-collector/entrypoint.sh /opt/log-collector/entrypoint.sh
45-
RUN chmod +x /opt/log-collector/entrypoint.sh
45+
COPY log-collector/mock-nvidia-bug-report.sh /opt/log-collector/mock-nvidia-bug-report.sh
46+
COPY log-collector/mock-must-gather.sh /opt/log-collector/mock-must-gather.sh
47+
RUN chmod +x /opt/log-collector/entrypoint.sh /opt/log-collector/mock-nvidia-bug-report.sh /opt/log-collector/mock-must-gather.sh
48+
4649

4750
ENV PATH="/opt/log-collector:${PATH}"
4851

log-collector/Tiltfile

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,13 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
docker_build(
16-
"ghcr.io/nvidia/nvsentinel/log-collector",
17-
context=".",
18-
dockerfile="./Dockerfile"
15+
# Build docker image and push to local registry with fixed tag
16+
# Using local_resource since the image is used in dynamically created jobs
17+
# Build from repo root since Dockerfile expects that context
18+
local_resource(
19+
"log-collector",
20+
"cd .. && docker build -t localhost:5001/ghcr.io_nvidia_nvsentinel_log-collector:latest -f log-collector/Dockerfile . && docker push localhost:5001/ghcr.io_nvidia_nvsentinel_log-collector:latest",
21+
deps=["./"],
22+
ignore=["./Tiltfile"],
23+
labels=["images"]
1924
)

log-collector/entrypoint.sh

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,28 @@ MUST_GATHER_SCRIPT_URL="${MUST_GATHER_SCRIPT_URL:-https://raw.githubusercontent.
3030
ENABLE_GCP_SOS_COLLECTION="${ENABLE_GCP_SOS_COLLECTION:-false}"
3131
ENABLE_AWS_SOS_COLLECTION="${ENABLE_AWS_SOS_COLLECTION:-false}"
3232

33+
# Mock mode for testing - prepends mock scripts to PATH
34+
MOCK_MODE="${MOCK_MODE:-false}"
35+
MOCK_EXIT_CODE="${MOCK_EXIT_CODE:-0}"
36+
37+
if [ "${MOCK_MODE}" = "true" ]; then
38+
echo "[MOCK] Enabling mock mode - using mock nvidia-bug-report.sh and must-gather.sh"
39+
MOCK_SCRIPTS_DIR="/opt/log-collector"
40+
41+
# Copy mock scripts and make them executable
42+
cp "${MOCK_SCRIPTS_DIR}/mock-nvidia-bug-report.sh" "${MOCK_SCRIPTS_DIR}/nvidia-bug-report.sh"
43+
cp "${MOCK_SCRIPTS_DIR}/mock-must-gather.sh" "${MOCK_SCRIPTS_DIR}/must-gather.sh"
44+
chmod +x "${MOCK_SCRIPTS_DIR}/nvidia-bug-report.sh" "${MOCK_SCRIPTS_DIR}/must-gather.sh"
45+
46+
# Prepend to PATH so mocks are used instead of real tools
47+
export PATH="${MOCK_SCRIPTS_DIR}:${PATH}"
48+
49+
# Override MUST_GATHER_SCRIPT_URL to use local mock instead of downloading
50+
MUST_GATHER_SCRIPT_URL="file://${MOCK_SCRIPTS_DIR}/must-gather.sh"
51+
52+
echo "[MOCK] Mock mode enabled. nvidia-bug-report.sh and must-gather.sh will use mock versions."
53+
fi
54+
3355
mkdir -p "${ARTIFACTS_DIR}"
3456
echo "[INFO] Target node: ${NODE_NAME} | GPU Operator namespace: ${GPU_OPERATOR_NAMESPACE} | Driver container: ${DRIVER_CONTAINER_NAME}"
3557

@@ -77,8 +99,16 @@ AWS_SOS_REPORT=""
7799
GCP_NVIDIA_BUG_REPORT="/host/home/kubernetes/bin/nvidia/bin/nvidia-bug-report.sh"
78100

79101
# 1) Collect nvidia-bug-report - auto-detect approach
102+
# In mock mode, use the local mock script directly
103+
if [ "${MOCK_MODE}" = "true" ]; then
104+
echo "[MOCK] Using local mock nvidia-bug-report.sh"
105+
BUG_REPORT_LOCAL_BASE="${ARTIFACTS_DIR}/nvidia-bug-report-${NODE_NAME}-${TIMESTAMP}"
106+
BUG_REPORT_LOCAL="${BUG_REPORT_LOCAL_BASE}.log.gz"
107+
nvidia-bug-report.sh --output-file "${BUG_REPORT_LOCAL_BASE}.log"
108+
echo "[MOCK] Bug report saved to ${BUG_REPORT_LOCAL}"
109+
80110
# Check if GCP COS nvidia-bug-report exists on the host filesystem (accessed via privileged container)
81-
if [ -f "${GCP_NVIDIA_BUG_REPORT}" ]; then
111+
elif [ -f "${GCP_NVIDIA_BUG_REPORT}" ]; then
82112
echo "[INFO] Found nvidia-bug-report at GCP COS location: ${GCP_NVIDIA_BUG_REPORT}"
83113

84114
# Use GCP COS approach - write directly to container filesystem
@@ -250,4 +280,10 @@ if [ -n "${UPLOAD_URL_BASE:-}" ]; then
250280
fi
251281
fi
252282

253-
echo "[INFO] Done. Artifacts under ${ARTIFACTS_DIR}"
283+
echo "[INFO] Done. Artifacts under ${ARTIFACTS_DIR}"
284+
285+
# Exit with MOCK_EXIT_CODE if in mock mode (for testing failure scenarios)
286+
if [ "${MOCK_MODE}" = "true" ]; then
287+
echo "[MOCK] Exiting with code: ${MOCK_EXIT_CODE}"
288+
exit "${MOCK_EXIT_CODE}"
289+
fi

log-collector/mock-must-gather.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Test stand-in for the GPU Operator must-gather script.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Creates a small, fixed directory tree (cluster-info/, logs/, namespaces/)
# in the current working directory and fills it with placeholder files.
# Env: NODE_NAME (optional) - recorded in cluster-info/info.txt; "unknown" if unset.
# Exit: 0 once all files are written; any failing command aborts via `set -e`.

set -e

printf '%s\n' "[MOCK] GPU Operator must-gather called"
printf '%s\n' "[MOCK] Collecting mock diagnostic data..."

# Lay out the directory skeleton that the placeholder files live in.
for subdir in namespaces logs cluster-info; do
  mkdir -p "./${subdir}"
done

# Summary file: four fixed lines, two of which carry runtime values.
{
  printf '%s\n' "Mock GPU Operator Must-Gather"
  printf 'Generated: %s\n' "$(date)"
  printf 'Node: %s\n' "${NODE_NAME:-unknown}"
  printf '%s\n' "Mock Mode: Enabled"
} > ./cluster-info/info.txt

# One placeholder file per remaining directory.
printf '%s\n' "Mock pod logs - $(date)" > ./logs/mock-pod.log
printf '%s\n' "Mock namespace data" > ./namespaces/mock-ns.yaml

printf '%s\n' "[MOCK] Must-gather complete"
exit 0
log-collector/mock-nvidia-bug-report.sh

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
# Mock nvidia-bug-report.sh for testing
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Usage:  nvidia-bug-report.sh --output-file <path>
# Output: writes a one-line, gzip-compressed report to "<path>.gz";
#         progress messages go to stdout, errors to stderr.
# Env:    NODE_NAME (optional) - embedded in the report; "unknown" if unset.
# Exit:   0 on success, 1 if no output file (or no value for it) was given.
# Arguments other than --output-file are accepted and ignored.

set -e

echo "[MOCK] nvidia-bug-report.sh called"

# Parse arguments to find output file
OUTPUT_FILE=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --output-file)
      # A trailing --output-file with no value would make `shift 2` fail and,
      # under `set -e`, kill the script without any message. Report it instead.
      if [[ $# -lt 2 ]]; then
        echo "[MOCK ERROR] --output-file requires a value" >&2
        exit 1
      fi
      OUTPUT_FILE="$2"
      shift 2
      ;;
    *)
      shift
      ;;
  esac
done

if [ -z "$OUTPUT_FILE" ]; then
  echo "[MOCK ERROR] No output file specified" >&2
  exit 1
fi

echo "[MOCK] Generating mock nvidia-bug-report to: ${OUTPUT_FILE}"

# Create a simple mock bug report and gzip it
# Real nvidia-bug-report.sh automatically appends .gz to the output file
# We mimic this behavior
echo "Mock NVIDIA Bug Report - Node: ${NODE_NAME:-unknown}, Date: $(date)" | gzip > "${OUTPUT_FILE}.gz"

echo "[MOCK] Mock nvidia-bug-report created successfully at ${OUTPUT_FILE}.gz"
exit 0
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# KWOK Stage to simulate log-collector pod failure for negative testing
2+
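# ISOLATION: the selector matches the app=log-collector label and .spec.nodeName, so only log-collector pods on the designated test node are affected
3+
# This file is a template: the test substitutes NODE_NAME_PLACEHOLDER with the actual node name when it creates the Stage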
4+
apiVersion: kwok.x-k8s.io/v1alpha1
5+
kind: Stage
6+
metadata:
7+
name: log-collector-failure
8+
spec:
9+
resourceRef:
10+
apiGroup: v1
11+
kind: Pod
12+
selector:
13+
matchLabels:
14+
app: log-collector
15+
matchExpressions:
16+
- key: .spec.nodeName
17+
operator: In
18+
values:
19+
- "NODE_NAME_PLACEHOLDER" # Will be replaced with actual test node name
20+
delay:
21+
durationMilliseconds: 2000
22+
next:
23+
statusTemplate: |
24+
{{ $now := Now }}
25+
conditions:
26+
- lastProbeTime: null
27+
lastTransitionTime: {{ $now }}
28+
message: "Simulated log-collector failure for testing"
29+
reason: Error
30+
status: "False"
31+
type: ContainersReady
32+
- lastProbeTime: null
33+
lastTransitionTime: {{ $now }}
34+
message: "Simulated log-collector failure for testing"
35+
reason: Error
36+
status: "False"
37+
type: Ready
38+
containerStatuses:
39+
- image: {{ index .spec.containers 0 "image" }}
40+
name: {{ index .spec.containers 0 "name" }}
41+
ready: false
42+
restartCount: 0
43+
started: false
44+
state:
45+
terminated:
46+
exitCode: 1
47+
finishedAt: {{ $now }}
48+
reason: Error
49+
message: "Simulated failure for log-collector testing"
50+
startedAt: {{ $now }}
51+
phase: Failed
52+

0 commit comments

Comments
 (0)