Skip to content

Commit 3b9946b

Browse files
authored
feat: Add AWS SOS report collection to log collector (#216)
1 parent 62fffb5 commit 3b9946b

File tree

5 files changed

+64
-6
lines changed

5 files changed

+64
-6
lines changed

distros/kubernetes/nvsentinel/charts/fault-remediation/files/log-collector-job.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ spec:
3030
serviceAccountName: log-collector-job
3131

3232
restartPolicy: Never
33-
# nodeName will be set programmatically by the controller before create
3433
{{- with .Values.tolerations }}
3534
tolerations:
3635
{{- toYaml . | nindent 8 }}
@@ -60,12 +59,14 @@ spec:
6059
value: {{ .Values.logCollector.uploadURL | quote }}
6160
- name: ENABLE_GCP_SOS_COLLECTION
6261
value: {{ .Values.logCollector.enableGcpSosCollection | quote }}
62+
- name: ENABLE_AWS_SOS_COLLECTION
63+
value: {{ .Values.logCollector.enableAwsSosCollection | quote }}
6364
volumeMounts:
6465
- name: artifacts
6566
mountPath: /artifacts
6667
- name: host-root
6768
mountPath: /host
68-
readOnly: true
69+
readOnly: false
6970
volumes:
7071
- name: artifacts
7172
emptyDir: {}

distros/kubernetes/nvsentinel/charts/fault-remediation/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,8 @@ spec:
9797
value: "{{ .Values.logCollector.enableGcpSosCollection }}"
9898
- name: LOG_LEVEL
9999
value: "{{ .Values.logLevel }}"
100+
- name: ENABLE_AWS_SOS_COLLECTION
101+
value: "{{ .Values.logCollector.enableAwsSosCollection }}"
100102
envFrom:
101103
- configMapRef:
102104
name: mongodb-config

distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ logCollector:
7171
image:
7272
repository: ghcr.io/nvidia/nvsentinel/log-collector
7373
pullPolicy: IfNotPresent
74-
uploadURL: "http://nvsentinel-file-server.nvsentinel.svc.cluster.local/upload"
74+
uploadURL: "http://nvsentinel-incluster-file-server.nvsentinel.svc.cluster.local/upload"
7575
gpuOperatorNamespaces: "gpu-operator"
7676
enableGcpSosCollection: false
77+
enableAwsSosCollection: false

distros/kubernetes/nvsentinel/charts/incluster-file-server/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,5 +64,5 @@ metrics:
6464
cleanupMetrics:
6565
image:
6666
repository: ghcr.io/nvidia/nvsentinel/file-server-cleanup
67-
tag: "" # Uses global.image.tag by default
67+
tag: ""
6868
pullPolicy: IfNotPresent

nvsentinel-log-collector/entrypoint.sh

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ GPU_OPERATOR_NAMESPACE="${GPU_OPERATOR_NAMESPACE:-gpu-operator}"
2828
DRIVER_CONTAINER_NAME="${DRIVER_CONTAINER_NAME:-nvidia-driver-ctr}"
2929
MUST_GATHER_SCRIPT_URL="${MUST_GATHER_SCRIPT_URL:-https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh}"
3030
ENABLE_GCP_SOS_COLLECTION="${ENABLE_GCP_SOS_COLLECTION:-false}"
31+
ENABLE_AWS_SOS_COLLECTION="${ENABLE_AWS_SOS_COLLECTION:-false}"
3132

3233
mkdir -p "${ARTIFACTS_DIR}"
3334
echo "[INFO] Target node: ${NODE_NAME} | GPU Operator namespace: ${GPU_OPERATOR_NAMESPACE} | Driver container: ${DRIVER_CONTAINER_NAME}"
@@ -43,8 +44,30 @@ is_running_on_gcp() {
4344
fi
4445
}
4546

46-
# Auto-detect nvidia-bug-report approach and collect GCP SOS if needed
47+
# Function to detect if running on AWS using IMDSv2 only
48+
#######################################
# Detect whether this host is an AWS EC2 instance, using IMDSv2 only.
# Requests a session token from the instance metadata endpoint and then
# uses it to read the metadata root.
# Globals:   none
# Arguments: none
# Outputs:   nothing (all curl output is discarded)
# Returns:   0 if both IMDSv2 requests succeed, 1 otherwise
#######################################
is_running_on_aws() {
  local timeout=5
  local imds_base="http://169.254.169.254"
  local token

  # -f is essential: without it curl exits 0 on HTTP error responses
  # (4xx/5xx) and prints the error body, so any endpoint answering on this
  # address with an error page would hand back a non-empty "token" and be
  # mistaken for IMDS.
  token=$(curl -sf -m "${timeout}" -X PUT \
    "${imds_base}/latest/api/token" \
    -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null) || return 1

  # A successful request with an empty body is still not a usable token.
  if [ -z "${token}" ]; then
    return 1
  fi

  # Confirm the token is actually accepted by the metadata service.
  if curl -sf -m "${timeout}" -H "X-aws-ec2-metadata-token: ${token}" \
    "${imds_base}/latest/meta-data/" >/dev/null 2>&1; then
    return 0
  fi

  return 1
}
67+
68+
# Auto-detect nvidia-bug-report approach and collect SOS reports if needed
4769
GCP_SOS_REPORT=""
70+
AWS_SOS_REPORT=""
4871
# Access host filesystem directly through privileged container
4972
GCP_NVIDIA_BUG_REPORT="/host/home/kubernetes/bin/nvidia/bin/nvidia-bug-report.sh"
5073

@@ -130,7 +153,33 @@ else
130153
echo "[INFO] SOS collection is disabled or not applicable for this environment"
131154
fi
132155

133-
# 2) GPU Operator must-gather (for all clusters)
156+
# 3) Collect AWS SOS report if on AWS and enabled
157+
# Check the feature flag before probing IMDS: is_running_on_aws makes up to
# two HTTP requests with timeouts, which is wasted time on every run when
# AWS SOS collection is disabled.
if [ "${ENABLE_AWS_SOS_COLLECTION}" = "true" ]; then
  if is_running_on_aws; then
    echo "[INFO] Collecting AWS SOS report..."

    # Unique label (epoch seconds + this shell's PID) so the archive produced
    # by this run can be told apart from other reports in the same directory.
    SOS_UNIQUE_ID="nvsentinel-$(date +%s)-$$"

    # sos runs chrooted into the host filesystem, so its /var/tmp output is
    # visible from this container as /host/var/tmp.
    if chroot /host bash -c "sos report --batch --tmp-dir=/var/tmp --name=${SOS_UNIQUE_ID}"; then
      # Find the SOS report with our unique identifier (exclude .sha256 checksum files)
      # Note: sos report prepends hostname, so pattern is: sosreport-<hostname>-<our-unique-id>-<date>-<random>.tar.*
      AWS_SOS_REPORT_PATH=$(find /host/var/tmp -name "sosreport-*-${SOS_UNIQUE_ID}-*.tar.*" -not -name "*.sha256" 2>/dev/null | head -n 1)

      if [ -n "${AWS_SOS_REPORT_PATH}" ] && [ -f "${AWS_SOS_REPORT_PATH}" ]; then
        AWS_SOS_REPORT="${ARTIFACTS_DIR}/$(basename "${AWS_SOS_REPORT_PATH}")"
        if cp "${AWS_SOS_REPORT_PATH}" "${AWS_SOS_REPORT}"; then
          echo "[INFO] AWS SOS report saved to ${AWS_SOS_REPORT}"
        else
          # Do not leave AWS_SOS_REPORT pointing at a file that was never written.
          echo "[WARN] Failed to copy AWS SOS report to ${AWS_SOS_REPORT}"
          AWS_SOS_REPORT=""
        fi
      else
        echo "[WARN] SOS report generated but file with unique ID ${SOS_UNIQUE_ID} not found"
      fi
    else
      echo "[WARN] SOS report collection failed - sos may not be installed on host"
    fi
  else
    echo "[INFO] AWS SOS collection enabled but not on AWS - skipping"
  fi
fi
181+
182+
# 4) GPU Operator must-gather (for all clusters)
134183
GPU_MG_DIR="${ARTIFACTS_DIR}/gpu-operator-must-gather"
135184
mkdir -p "${GPU_MG_DIR}"
136185
echo "[INFO] Running GPU Operator must-gather..."
@@ -164,6 +213,11 @@ if [ -n "${UPLOAD_URL_BASE:-}" ]; then
164213
"${UPLOAD_URL_BASE}/${NODE_NAME}/${TIMESTAMP}/$(basename "${GCP_SOS_REPORT}")" || true
165214
echo "[UPLOAD_SUCCESS] GCP SOS report uploaded: $(basename "${GCP_SOS_REPORT}")"
166215
fi
216+
# Upload the AWS SOS report if one was collected. The upload stays
# best-effort (a failure does not abort the script), but the success marker
# is only printed when curl actually succeeded; -f makes curl exit non-zero
# on HTTP error responses.
if [ -n "${AWS_SOS_REPORT}" ] && [ -f "${AWS_SOS_REPORT}" ]; then
  if curl -fsS -X PUT --upload-file "${AWS_SOS_REPORT}" \
    "${UPLOAD_URL_BASE}/${NODE_NAME}/${TIMESTAMP}/$(basename "${AWS_SOS_REPORT}")"; then
    echo "[UPLOAD_SUCCESS] AWS SOS report uploaded: $(basename "${AWS_SOS_REPORT}")"
  else
    echo "[WARN] AWS SOS report upload failed: $(basename "${AWS_SOS_REPORT}")"
  fi
fi
167221
fi
168222

169223
echo "[INFO] Done. Artifacts under ${ARTIFACTS_DIR}"

0 commit comments

Comments
 (0)