@@ -28,6 +28,7 @@ GPU_OPERATOR_NAMESPACE="${GPU_OPERATOR_NAMESPACE:-gpu-operator}"
# Runtime settings. Each one keeps the value already present in the
# environment and otherwise falls back to the default after ':-'.
DRIVER_CONTAINER_NAME="${DRIVER_CONTAINER_NAME:-nvidia-driver-ctr}"
MUST_GATHER_SCRIPT_URL="${MUST_GATHER_SCRIPT_URL:-https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh}"
# Opt-in switches for SOS report collection; both default to "false".
ENABLE_GCP_SOS_COLLECTION="${ENABLE_GCP_SOS_COLLECTION:-false}"
ENABLE_AWS_SOS_COLLECTION="${ENABLE_AWS_SOS_COLLECTION:-false}"
3132
3233mkdir -p " ${ARTIFACTS_DIR} "
3334echo " [INFO] Target node: ${NODE_NAME} | GPU Operator namespace: ${GPU_OPERATOR_NAMESPACE} | Driver container: ${DRIVER_CONTAINER_NAME} "
@@ -43,8 +44,30 @@ is_running_on_gcp() {
4344 fi
4445}
4546
46- # Auto-detect nvidia-bug-report approach and collect GCP SOS if needed
#######################################
# Detect whether this node is running on AWS, using IMDSv2 only.
# Requests a session token from the link-local metadata endpoint, then uses
# that token to read the metadata root. No IMDSv1 fallback is attempted.
# Arguments: none
# Outputs:   nothing (curl output and errors are discarded)
# Returns:   0 if both requests succeed and the token is non-empty,
#            1 otherwise
#######################################
is_running_on_aws() {
  local timeout=5
  local imds_base="http://169.254.169.254/latest"

  # Get IMDSv2 session token.
  # -f makes curl exit non-zero on HTTP >= 400, so an error body returned by
  # whatever answers on this address is never mistaken for a valid token.
  local token
  token=$(curl -sf -m "${timeout}" -X PUT \
    "${imds_base}/api/token" \
    -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null) || return 1

  if [ -z "${token}" ]; then
    return 1
  fi

  # Use IMDSv2 with the token to check metadata availability; an HTTP error
  # status here also counts as "not on AWS" thanks to -f.
  if curl -sf -m "${timeout}" -H "X-aws-ec2-metadata-token: ${token}" \
    "${imds_base}/meta-data/" >/dev/null 2>&1; then
    return 0
  fi

  return 1
}
67+
# Auto-detect nvidia-bug-report approach and collect SOS reports if needed
# Paths of collected SOS archives. They start empty so that later "-n" checks
# treat "nothing collected" as false.
GCP_SOS_REPORT=""
AWS_SOS_REPORT=""
# Access host filesystem directly through privileged container
GCP_NVIDIA_BUG_REPORT="/host/home/kubernetes/bin/nvidia/bin/nvidia-bug-report.sh"
5073
@@ -130,7 +153,33 @@ else
130153 echo " [INFO] SOS collection is disabled or not applicable for this environment"
131154fi
132155
133- # 2) GPU Operator must-gather (for all clusters)
# 3) Collect AWS SOS report if on AWS and enabled.
# The opt-in flag is tested first so the metadata probe (two HTTP requests,
# each with its own timeout) only runs when collection was requested.
if [ "${ENABLE_AWS_SOS_COLLECTION:-false}" = "true" ]; then
  if is_running_on_aws; then
    echo "[INFO] Collecting AWS SOS report..."

    # Generate a unique identifier for this SOS report
    SOS_UNIQUE_ID="nvsentinel-$(date +%s)-$$"

    # sos runs inside the host root; its output lands in the host's /var/tmp,
    # which this container sees as /host/var/tmp.
    if chroot /host bash -c "sos report --batch --tmp-dir=/var/tmp --name=${SOS_UNIQUE_ID}"; then
      # Find the SOS report with our unique identifier (exclude .sha256 checksum files)
      # Note: sos report prepends hostname, so pattern is: sosreport-<hostname>-<our-unique-id>-<date>-<random>.tar.*
      AWS_SOS_REPORT_PATH=$(find /host/var/tmp -name "sosreport-*-${SOS_UNIQUE_ID}-*.tar.*" -not -name "*.sha256" 2>/dev/null | head -1)

      if [ -n "${AWS_SOS_REPORT_PATH}" ] && [ -f "${AWS_SOS_REPORT_PATH}" ]; then
        AWS_SOS_REPORT="${ARTIFACTS_DIR}/$(basename "${AWS_SOS_REPORT_PATH}")"
        if cp "${AWS_SOS_REPORT_PATH}" "${AWS_SOS_REPORT}"; then
          echo "[INFO] AWS SOS report saved to ${AWS_SOS_REPORT}"
        else
          echo "[WARN] Failed to copy AWS SOS report to ${ARTIFACTS_DIR}"
          # Clear the path so a partial copy is not picked up later.
          AWS_SOS_REPORT=""
        fi
      else
        echo "[WARN] SOS report generated but file with unique ID ${SOS_UNIQUE_ID} not found"
      fi
    else
      echo "[WARN] SOS report collection failed - sos may not be installed on host"
    fi
  else
    echo "[INFO] AWS SOS collection enabled but not on AWS - skipping"
  fi
fi
181+
182+ # 4) GPU Operator must-gather (for all clusters)
134183GPU_MG_DIR=" ${ARTIFACTS_DIR} /gpu-operator-must-gather"
135184mkdir -p " ${GPU_MG_DIR} "
136185echo " [INFO] Running GPU Operator must-gather..."
@@ -164,6 +213,11 @@ if [ -n "${UPLOAD_URL_BASE:-}" ]; then
164213 " ${UPLOAD_URL_BASE} /${NODE_NAME} /${TIMESTAMP} /$( basename " ${GCP_SOS_REPORT} " ) " || true
165214 echo " [UPLOAD_SUCCESS] GCP SOS report uploaded: $( basename " ${GCP_SOS_REPORT} " ) "
166215 fi
# Upload the AWS SOS report when one was collected. Best-effort: a failed
# upload is logged and the script carries on, but success is only reported
# when curl actually succeeded.
if [ -n "${AWS_SOS_REPORT}" ] && [ -f "${AWS_SOS_REPORT}" ]; then
  if curl -fsS -X PUT --upload-file "${AWS_SOS_REPORT}" \
    "${UPLOAD_URL_BASE}/${NODE_NAME}/${TIMESTAMP}/$(basename "${AWS_SOS_REPORT}")"; then
    echo "[UPLOAD_SUCCESS] AWS SOS report uploaded: $(basename "${AWS_SOS_REPORT}")"
  else
    echo "[WARN] AWS SOS report upload failed: $(basename "${AWS_SOS_REPORT}")"
  fi
fi
167221fi
168222
169223echo " [INFO] Done. Artifacts under ${ARTIFACTS_DIR} "
0 commit comments