diff --git a/Makefile b/Makefile index ac56f7ba..834eb645 100644 --- a/Makefile +++ b/Makefile @@ -55,16 +55,17 @@ OUT_IMAGE = $(OUT_IMAGE_NAME):$(OUT_IMAGE_TAG) ##### Public rules ##### DISTRIBUTIONS := ubuntu18.04 ubuntu20.04 ubuntu22.04 ubuntu24.04 signed_ubuntu20.04 signed_ubuntu22.04 signed_ubuntu24.04 rhel8 rhel9 flatcar fedora36 sles15.3 precompiled_rhcos +RHCOS_VERSIONS := rhcos4.14 rhcos4.15 rhcos4.16 rhcos4.17 rhcos4.18 rhcos4.19 rhcos4.20 PUSH_TARGETS := $(patsubst %, push-%, $(DISTRIBUTIONS)) BASE_FROM := noble jammy focal PUSH_TARGETS := $(patsubst %, push-%, $(DISTRIBUTIONS)) -VGPU_GUEST_DRIVER_PUSH_TARGETS := $(patsubst %, push-vgpuguest-%, $(DISTRIBUTIONS)) -VGPU_HOST_DRIVER_PUSH_TARGETS := $(patsubst %, push-vgpuhost-%, $(DISTRIBUTIONS)) +VGPU_GUEST_DRIVER_PUSH_TARGETS := $(patsubst %, push-vgpuguest-%, $(DISTRIBUTIONS) $(RHCOS_VERSIONS)) +VGPU_HOST_DRIVER_PUSH_TARGETS := $(patsubst %, push-vgpuhost-%, $(DISTRIBUTIONS) $(RHCOS_VERSIONS)) DRIVER_PUSH_TARGETS := $(foreach push_target, $(PUSH_TARGETS), $(addprefix $(push_target)-, $(DRIVER_VERSIONS))) BUILD_TARGETS := $(patsubst %, build-%, $(DISTRIBUTIONS)) DRIVER_BUILD_TARGETS := $(foreach build_target, $(BUILD_TARGETS), $(addprefix $(build_target)-, $(DRIVER_VERSIONS))) -VGPU_GUEST_DRIVER_BUILD_TARGETS := $(patsubst %, build-vgpuguest-%, $(DISTRIBUTIONS)) -VGPU_HOST_DRIVER_BUILD_TARGETS := $(patsubst %, build-vgpuhost-%, $(DISTRIBUTIONS)) +VGPU_GUEST_DRIVER_BUILD_TARGETS := $(patsubst %, build-vgpuguest-%, $(DISTRIBUTIONS) $(RHCOS_VERSIONS)) +VGPU_HOST_DRIVER_BUILD_TARGETS := $(patsubst %, build-vgpuhost-%, $(DISTRIBUTIONS) $(RHCOS_VERSIONS)) TEST_TARGETS := $(patsubst %, test-%, $(DISTRIBUTIONS)) PULL_TARGETS := $(patsubst %, pull-%, $(DISTRIBUTIONS)) DRIVER_PULL_TARGETS := $(foreach pull_target, $(PULL_TARGETS), $(addprefix $(pull_target)-, $(DRIVER_VERSIONS))) @@ -243,7 +244,15 @@ build-vgpuguest-%: DOCKERFILE = $(CURDIR)/$(SUBDIR)/Dockerfile # Remove '-grid' substring in the image tag 
build-vgpuguest-%: DRIVER_TAG = $(DRIVER_VERSION:-grid=) -build-vgpuguest-rhcos%: SUBDIR = rhel8 +# Source of truth for RHEL and CoreOS compatibility https://access.redhat.com/articles/6907891 +# Note: INTENTIONALLY replace SUBDIR, as otherwise the pattern rule above would set SUBDIR to "rhcos4.18" for `make build-vgpuguest-rhcos4.18` +build-vgpuguest-rhcos4.14: SUBDIR = rhel8 +build-vgpuguest-rhcos4.15: SUBDIR = rhel9 +build-vgpuguest-rhcos4.16: SUBDIR = rhel9 +build-vgpuguest-rhcos4.17: SUBDIR = rhel9 +build-vgpuguest-rhcos4.18: SUBDIR = rhel9 +build-vgpuguest-rhcos4.19: SUBDIR = rhel9 +build-vgpuguest-rhcos4.20: SUBDIR = rhel9 $(VGPU_GUEST_DRIVER_BUILD_TARGETS): DOCKER_BUILDKIT=1 \ @@ -280,7 +289,17 @@ build-vgpuhost-%: DIST = $(word 3,$(subst -, ,$@)) build-vgpuhost-%: SUBDIR = $(word 3,$(subst -, ,$@)) build-vgpuhost-%: DOCKERFILE = $(CURDIR)/vgpu-manager/$(SUBDIR)/Dockerfile -build-vgpuhost-rhcos%: SUBDIR = rhel8 +# Source of truth for RHEL and CoreOS compatibility https://access.redhat.com/articles/6907891 +# Note: INTENTIONALLY replace SUBDIR, as otherwise the pattern rule above would set SUBDIR to "rhcos4.X" for `make build-vgpuhost-rhcos4.X` +build-vgpuhost-rhcos4.12: SUBDIR = rhel8 +build-vgpuhost-rhcos4.13: SUBDIR = rhel8 +build-vgpuhost-rhcos4.14: SUBDIR = rhel8 +build-vgpuhost-rhcos4.15: SUBDIR = rhel9 +build-vgpuhost-rhcos4.16: SUBDIR = rhel9 +build-vgpuhost-rhcos4.17: SUBDIR = rhel9 +build-vgpuhost-rhcos4.18: SUBDIR = rhel9 +build-vgpuhost-rhcos4.19: SUBDIR = rhel9 +build-vgpuhost-rhcos4.20: SUBDIR = rhel9 $(VGPU_HOST_DRIVER_BUILD_TARGETS): DOCKER_BUILDKIT=1 \ @@ -297,8 +316,6 @@ $(VGPU_HOST_DRIVER_BUILD_TARGETS): --file $(DOCKERFILE) \ $(CURDIR)/vgpu-manager/$(SUBDIR) - - # $(VGPU_HOST_DRIVER_PUSH_TARGETS) is in the form of push-vgpuhost-$(DIST) # VGPU_HOST_DRIVER_VERSION must be defined in the environment when invoking this target. 
push-vgpuhost-%: $(if $(VGPU_HOST_DRIVER_VERSION),,$(error "VGPU_HOST_DRIVER_VERSION is not set")) diff --git a/vgpu-manager/rhel9/Dockerfile b/vgpu-manager/rhel9/Dockerfile new file mode 100644 index 00000000..75b7f20f --- /dev/null +++ b/vgpu-manager/rhel9/Dockerfile @@ -0,0 +1,34 @@ +FROM nvcr.io/nvidia/cuda:13.0.1-base-ubi9 + +ARG DRIVER_VERSION +ENV DRIVER_VERSION=$DRIVER_VERSION +ARG DRIVER_ARCH=x86_64 +ENV DRIVER_ARCH=$DRIVER_ARCH + +RUN mkdir -p /driver +WORKDIR /driver +COPY NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}-vgpu-kvm.run . +RUN chmod +x NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}-vgpu-kvm.run + +COPY nvidia-driver /usr/local/bin +COPY ocp_dtk_entrypoint /usr/local/bin + +LABEL io.k8s.display-name="NVIDIA vGPU Manager Container" +LABEL name="NVIDIA vGPU Manager Container" +LABEL vendor="NVIDIA" +LABEL version="${DRIVER_VERSION}" +LABEL release="N/A" +LABEL summary="Provision the NVIDIA vGPU Manager through containers" +LABEL description="See summary" + +# Install / upgrade packages here that are required to resolve CVEs +ARG CVE_UPDATES +RUN if [ -n "${CVE_UPDATES}" ]; then \ + yum update -y ${CVE_UPDATES} && \ + rm -rf /var/cache/yum/*; \ + fi + +# Add NGC DL license from the CUDA image +RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE + +ENTRYPOINT ["nvidia-driver", "init"] diff --git a/vgpu-manager/rhel9/nvidia-driver b/vgpu-manager/rhel9/nvidia-driver new file mode 100755 index 00000000..26d8a9f5 --- /dev/null +++ b/vgpu-manager/rhel9/nvidia-driver @@ -0,0 +1,252 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +set -xe + +DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"} +DRIVER_RESET_RETRIES=10 +DELAY_BEFORE_VF_CREATION=${DELAY_BEFORE_VF_CREATION:-15} +RUN_DIR=/run/nvidia + +# Mount the driver rootfs into the run directory with the exception of sysfs. +_mount_rootfs() { + echo "Mounting NVIDIA driver rootfs..." 
+ mount --make-runbindable /sys + mount --make-private /sys + mkdir -p ${RUN_DIR}/driver + mount --rbind / ${RUN_DIR}/driver + + echo "Change device files security context for selinux compatibility" + chcon -R -t container_file_t ${RUN_DIR}/driver/dev +} + +# Unmount the driver rootfs from the run directory. +_unmount_rootfs() { + echo "Unmounting NVIDIA driver rootfs..." + if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then + umount -l -R ${RUN_DIR}/driver + fi +} + +# Create /dev/char directory if it doesn't exist inside the container. +# Without this directory, nvidia-vgpu-mgr will fail to create symlinks +# under /dev/char for new devices nodes. +_create_dev_char_directory() { + if [ ! -d "/dev/char" ]; then + echo "Creating '/dev/char' directory" + mkdir -p /dev/char + fi +} + +_set_fw_search_path() { + local nv_fw_search_path="$RUN_DIR/driver/lib/firmware" + local fw_path_config_file="/sys/module/firmware_class/parameters/path" + + if [[ ! -z $(grep '[^[:space:]]' $fw_path_config_file) ]]; then + echo "WARNING: A search path is already configured in $fw_path_config_file" + echo " Retaining the current configuration. Note, GSP firmware may not be found and thus won't be used by the NVIDIA driver." + return + fi + + echo "Configuring the following firmware search path in '$fw_path_config_file': $nv_fw_search_path" + echo -n "$nv_fw_search_path" > $fw_path_config_file +} + +_install_driver() { + local tmp_dir=$(mktemp -d) + + sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}-vgpu-kvm.run --ui=none --no-questions --tmpdir ${tmp_dir} --no-systemd +} + +# Currently _install_driver() takes care of loading nvidia modules. Just need to start necessary vgpu daemons +_load_driver() { + /usr/bin/nvidia-vgpud + /usr/bin/nvidia-vgpu-mgr & + + # check nvidia drivers are loaded + if [ ! -f /sys/module/nvidia_vgpu_vfio/refcnt ] || [ ! 
-f /sys/module/nvidia/refcnt ]; then + echo "Failed to load nvidia driver" + return 1 + fi + return 0 +} + +# Enable virtual functions for all physical GPUs on the node that support SR-IOV. +# Retry logic is to account for when the driver is busy (i.e. during driver initialization) +_enable_vfs() { + # Wait before attempting to create VFs to ensure the driver has finished initializing. + # This is a WAR for a bug in vGPU 17.2 where sriov-manage does not return a non-zero + # exit code even though VF creation fails. + sleep $DELAY_BEFORE_VF_CREATION + + local retry + for ((retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++)); do + if /usr/lib/nvidia/sriov-manage -e ALL; then + return 0 + fi + if [ $retry == $DRIVER_RESET_RETRIES ]; then + echo "Failed to enable VFs" + fi + done + return 1 +} + +# Disable virtual functions for all physical GPUs on the node that support SR-IOV. +# Retry logic is to account for when the driver is busy (i.e. during driver initialization) +_disable_vfs() { + local retry + for ((retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++)); do + if /usr/lib/nvidia/sriov-manage -d ALL; then + return 0 + fi + if [ $retry == $DRIVER_RESET_RETRIES ]; then + echo "Failed to disable VFs" + fi + done + return 1 +} + +_unload_driver() { + local rmmod_args=() + local nvidia_deps=0 + local nvidia_refs=0 + local nvidia_vgpu_vfio_refs=0 + + if [ -f /var/run/nvidia-vgpu-mgr/nvidia-vgpu-mgr.pid ]; then + echo "Stopping NVIDIA vGPU Manager..." + local pid=$(< /var/run/nvidia-vgpu-mgr/nvidia-vgpu-mgr.pid) + + kill -TERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA vGPU Manager" >&2 + return 1 + fi + fi + + echo "Unloading NVIDIA driver kernel modules..." 
+ if [ -f /sys/module/nvidia_vgpu_vfio/refcnt ]; then + nvidia_vgpu_vfio_refs=$(< /sys/module/nvidia_vgpu_vfio/refcnt) + rmmod_args+=("nvidia_vgpu_vfio") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia/refcnt ]; then + nvidia_refs=$(< /sys/module/nvidia/refcnt) + rmmod_args+=("nvidia") + fi + + # TODO: check if nvidia module is in use by checking refcnt + + if [ ${#rmmod_args[@]} -gt 0 ]; then + rmmod ${rmmod_args[@]} + if [ "$?" != "0" ]; then + return 1 + fi + fi + return 0 +} + +_shutdown() { + if _disable_vfs && _unload_driver; then + _unmount_rootfs + return 0 + fi + echo "Failed to cleanup driver" + return 1 +} + +build() { + echo "build() not implemented" +} + +load() { + echo "load() not implemented" +} + +update() { + echo "update() not implemented" +} + +init() { + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + trap "_shutdown" EXIT + + if ! _unload_driver; then + echo "Previous NVIDIA driver installation cannot be removed. Exiting" + exit 1 + fi + _unmount_rootfs + _create_dev_char_directory + _set_fw_search_path + _install_driver + _load_driver || exit 1 + _mount_rootfs + _enable_vfs + + # In certain scenarios, /sys/class/mdev_bus is not populated with the correct list of devices (PFs and possible VFs) at this point. + # Re-run nvidia-vgpud to ensure /sys/class/mdev_bus is populated correctly. And restart nvidia-vgpu-mgr if previously killed. + nvidia-vgpud & + pgrep nvidia-vgpu-mgr >/dev/null || (echo "Restarting nvidia-vgpu-mgr after previously killed" && nvidia-vgpu-mgr &) + + set +x + echo "Done, now waiting for signal" + trap "echo 'Caught signal'; _shutdown; trap - EXIT; exit" HUP INT QUIT PIPE TERM + + while true; do + sleep 15 + pgrep nvidia-vgpu-mgr >/dev/null || (echo "ERROR: nvidia-vgpu-mgr daemon is no longer running. Exiting." && exit 1) + done +} + + +usage() { + cat >&2 < "$DRIVER_TOOLKIT_SHARED_DIR/env" + + touch "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" + fi + + set +x + while [[ ! 
-f "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started" ]]; do + if [[ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_toolkit_broken" ]]; then + echo "WARNING: broken driver toolkit detected" + exit 1 + # TODO: use entitlement based fallback + #exec bash -x nvidia-driver init + fi + echo "$(date) Waiting for openshift-driver-toolkit-ctr container to start ..." + sleep 15 + done + + echo "$(date) openshift-driver-toolkit-ctr started." + + # TODO: Currently dtk-build-driver will actually install and load the driver as well. + # Uncomment the following if/when dtk-build-driver only builds precompiled driver. + #while [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]]; do + # echo "$(date) Waiting for openshift-driver-toolkit-ctr container to build the precompiled driver ..." + # sleep 15 + #done + + #echo "$(date) openshift-driver-toolkit-ctr finished building driver." + set -x + sleep infinity +} + +dtk-build-driver() { + if [[ "${RHCOS_IMAGE_MISSING:-}" == "true" ]]; then + echo "WARNING: 'istag/driver-toolkit:${RHCOS_VERSION} -n openshift' missing, nothing to do in openshift-driver-toolkit-ctr container" + sleep +inf + fi + + if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then + echo "WARNING: broken Driver Toolkit image detected:" + echo "- Node kernel: $(uname -r)" + echo "- Kernel package: $(rpm -q --qf "%{VERSION}-%{RELEASE}.%{ARCH}" kernel-core)" + + # TODO: log entitlement based fallback + #echo "INFO: informing nvidia-driver-ctr to fallback on entitled-build." + touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_toolkit_broken" + echo "INFO: nothing else to do in openshift-driver-toolkit-ctr container, sleeping forever." + sleep +inf + fi + + # Shared directory is prepared before entering this script. See + # 'until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ] ...' 
+ # in the Pod command/args + + touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started" + + set -x + set -o allexport + source "${DRIVER_TOOLKIT_SHARED_DIR}/env" + set +o allexport; + + # if this directory already exists, + # NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run fails to run + # and doesn't create its files. This may happen when the + # container fails and restart its execution, leading to + # hard-to-understand "unrelated" errors in the following of the script execution + + rm -rf "${DRIVER_TOOLKIT_SHARED_DIR}/driver/NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}"; + + mkdir "${DRIVER_TOOLKIT_SHARED_DIR}/bin" -p + + cp -v \ + "$DRIVER_TOOLKIT_SHARED_DIR/nvidia-driver" \ + "${DRIVER_TOOLKIT_SHARED_DIR}/bin" + + export PATH="${DRIVER_TOOLKIT_SHARED_DIR}/bin:$PATH"; + + # ensure lspci is installed, as 'sriov-manage' script requires it + if ! $(lspci >/dev/null); then + dnf install -y pciutils && rm -rf /var/cache/yum/* + fi + + # upon catching a signal, terminate child process to trigger driver cleanup + trap 'echo "Caught signal"; kill "${child_pid}"; wait "${child_pid}"; exit' HUP INT QUIT PIPE TERM + cd "${DRIVER_TOOLKIT_SHARED_DIR}/driver"; + echo "#" + echo "# Executing nvidia-driver install script ..." + echo "#" + bash -x "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver" init & + + child_pid="$!" + wait "${child_pid}" + + # TODO: only build driver in the dtk, and let main container load. + # 'nvidia-driver init' will only exit if it fails + echo "Driver installation failed. Exiting ..." + exit 1 +} + +usage() { + cat >&2 <