Skip to content

Commit 1b8bc94

Browse files
committed
integrate gpu-driver-util into the driver images
Signed-off-by: Tariq Ibrahim <[email protected]>
1 parent 0f4ecad commit 1b8bc94

File tree

14 files changed

+192
-27
lines changed

14 files changed

+192
-27
lines changed

rhel8/Dockerfile

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@ ENV PATH /usr/local/go/bin:$PATH
1717
WORKDIR /work
1818

1919
RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
20-
cd driver/vgpu/src && \
21-
go build -o vgpu-util && \
22-
mv vgpu-util /work
20+
go build -C driver/vgpu/src -o vgpu-util && \
21+
mv driver/vgpu/src/vgpu-util /work && \
22+
go build -C driver/gpu-driver-util -o gpu-driver-util && \
23+
mv driver/gpu-driver-util/gpu-driver-util /work
2324

2425
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubi8
2526

@@ -36,6 +37,8 @@ ENV DRIVER_VERSION=$DRIVER_VERSION
3637
# Arg to indicate if driver type is either of passthrough/baremetal or vgpu
3738
ARG DRIVER_TYPE=passthrough
3839
ENV DRIVER_TYPE=$DRIVER_TYPE
40+
ARG DRIVER_BRANCH=550
41+
ENV DRIVER_BRANCH=$DRIVER_BRANCH
3942
ARG VGPU_LICENSE_SERVER_TYPE=NLS
4043
ENV VGPU_LICENSE_SERVER_TYPE=$VGPU_LICENSE_SERVER_TYPE
4144
# Enable vGPU version compatibility check by default
@@ -84,6 +87,7 @@ COPY ocp_dtk_entrypoint /usr/local/bin
8487
COPY common.sh /usr/local/bin
8588

8689
COPY --from=build /work/vgpu-util /usr/local/bin
90+
COPY --from=build /work/gpu-driver-util /usr/local/bin
8791

8892
WORKDIR /drivers
8993

rhel8/nvidia-driver

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ set -eu
66
RUN_DIR=/run/nvidia
77
PID_FILE=${RUN_DIR}/${0##*/}.pid
88
DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"}
9+
DRIVER_BRANCH=${DRIVER_BRANCH:?"Missing DRIVER_BRANCH env"}
910
KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
1011
NUM_VGPU_DEVICES=0
1112
NVIDIA_MODULE_PARAMS=()
@@ -17,9 +18,7 @@ USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
1718
DNF_RELEASEVER=${DNF_RELEASEVER:-""}
1819
RHEL_VERSION=${RHEL_VERSION:-""}
1920
RHEL_MAJOR_VERSION=8
20-
21-
OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
22-
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
21+
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
2322

2423
DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
2524
echo "DRIVER_ARCH is $DRIVER_ARCH"
@@ -577,6 +576,24 @@ _start_vgpu_topology_daemon() {
577576
nvidia-topologyd
578577
}
579578

579+
_resolve_kernel_type() {
580+
if [ "${KERNEL_MODULE_TYPE}" == "proprietary" ]; then
581+
KERNEL_TYPE=kernel
582+
elif [ "${KERNEL_MODULE_TYPE}" == "open" ]; then
583+
KERNEL_TYPE=kernel-open
584+
elif [ "${KERNEL_MODULE_TYPE}" == "auto" ]; then
585+
KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}")
586+
if [ $? -ne 0 ]; then
587+
echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
588+
tail -n 3 /var/log/gpu-driver-util.log
589+
return 1
590+
fi
591+
else
592+
echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
593+
return 1
594+
fi
595+
}
596+
580597
_prepare() {
581598
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
582599
_find_vgpu_driver_version || exit 1
@@ -797,5 +814,6 @@ if [ $# -ne 0 ]; then
797814
fi
798815

799816
_resolve_rhel_version || exit 1
817+
_resolve_kernel_type || exit 1
800818

801819
$command

rhel8/ocp_dtk_entrypoint

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ nv-ctr-run-with-dtk() {
2525
/usr/local/bin/nvidia-driver \
2626
/usr/local/bin/common.sh \
2727
/usr/local/bin/extract-vmlinux \
28+
/usr/local/bin/gpu-driver-util \
2829
/usr/local/bin/vgpu-util \
2930
/drivers \
3031
/licenses \
@@ -136,6 +137,7 @@ dtk-build-driver() {
136137
"$DRIVER_TOOLKIT_SHARED_DIR/nvidia-driver" \
137138
"$DRIVER_TOOLKIT_SHARED_DIR/common.sh" \
138139
"$DRIVER_TOOLKIT_SHARED_DIR/extract-vmlinux" \
140+
"$DRIVER_TOOLKIT_SHARED_DIR/gpu-driver-util" \
139141
"$DRIVER_TOOLKIT_SHARED_DIR/vgpu-util" \
140142
"${DRIVER_TOOLKIT_SHARED_DIR}/bin"
141143

rhel9/Dockerfile

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@ ENV PATH /usr/local/go/bin:$PATH
1717
WORKDIR /work
1818

1919
RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
20-
cd driver/vgpu/src && \
21-
go build -o vgpu-util && \
22-
mv vgpu-util /work
20+
go build -C driver/vgpu/src -o vgpu-util && \
21+
mv driver/vgpu/src/vgpu-util /work && \
22+
go build -C driver/gpu-driver-util -o gpu-driver-util && \
23+
mv driver/gpu-driver-util/gpu-driver-util /work
2324

2425
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubi9
2526

@@ -36,6 +37,8 @@ ENV DRIVER_VERSION=$DRIVER_VERSION
3637
# Arg to indicate if driver type is either of passthrough/baremetal or vgpu
3738
ARG DRIVER_TYPE=passthrough
3839
ENV DRIVER_TYPE=$DRIVER_TYPE
40+
ARG DRIVER_BRANCH=550
41+
ENV DRIVER_BRANCH=$DRIVER_BRANCH
3942
ARG VGPU_LICENSE_SERVER_TYPE=NLS
4043
ENV VGPU_LICENSE_SERVER_TYPE=$VGPU_LICENSE_SERVER_TYPE
4144
# Enable vGPU version compatibility check by default
@@ -78,6 +81,7 @@ COPY ocp_dtk_entrypoint /usr/local/bin
7881
COPY common.sh /usr/local/bin
7982

8083
COPY --from=build /work/vgpu-util /usr/local/bin
84+
COPY --from=build /work/gpu-driver-util /usr/local/bin
8185

8286
WORKDIR /drivers
8387

rhel9/nvidia-driver

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ set -eu
66
RUN_DIR=/run/nvidia
77
PID_FILE=${RUN_DIR}/${0##*/}.pid
88
DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"}
9+
DRIVER_BRANCH=${DRIVER_BRANCH:?"Missing DRIVER_BRANCH env"}
910
KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
1011
NUM_VGPU_DEVICES=0
1112
NVIDIA_MODULE_PARAMS=()
@@ -17,9 +18,7 @@ USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
1718
DNF_RELEASEVER=${DNF_RELEASEVER:-""}
1819
RHEL_VERSION=${RHEL_VERSION:-""}
1920
RHEL_MAJOR_VERSION=9
20-
21-
OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
22-
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
21+
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
2322

2423
DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
2524
echo "DRIVER_ARCH is $DRIVER_ARCH"
@@ -571,6 +570,24 @@ _find_vgpu_driver_version() {
571570
return 0
572571
}
573572

573+
_resolve_kernel_type() {
574+
if [ "${KERNEL_MODULE_TYPE}" == "proprietary" ]; then
575+
KERNEL_TYPE=kernel
576+
elif [ "${KERNEL_MODULE_TYPE}" == "open" ]; then
577+
KERNEL_TYPE=kernel-open
578+
elif [ "${KERNEL_MODULE_TYPE}" == "auto" ]; then
579+
KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}")
580+
if [ $? -ne 0 ]; then
581+
echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
582+
tail -n 3 /var/log/gpu-driver-util.log
583+
return 1
584+
fi
585+
else
586+
echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
587+
return 1
588+
fi
589+
}
590+
574591
_start_vgpu_topology_daemon() {
575592
type nvidia-topologyd > /dev/null 2>&1 || return 0
576593
echo "Starting nvidia-topologyd.."
@@ -797,5 +814,6 @@ if [ $# -ne 0 ]; then
797814
fi
798815

799816
_resolve_rhel_version || exit 1
817+
_resolve_kernel_type || exit 1
800818

801819
$command

rhel9/ocp_dtk_entrypoint

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ nv-ctr-run-with-dtk() {
2525
/usr/local/bin/nvidia-driver \
2626
/usr/local/bin/common.sh \
2727
/usr/local/bin/extract-vmlinux \
28+
/usr/local/bin/gpu-driver-util \
2829
/usr/local/bin/vgpu-util \
2930
/drivers \
3031
/licenses \
@@ -136,6 +137,7 @@ dtk-build-driver() {
136137
"$DRIVER_TOOLKIT_SHARED_DIR/nvidia-driver" \
137138
"$DRIVER_TOOLKIT_SHARED_DIR/common.sh" \
138139
"$DRIVER_TOOLKIT_SHARED_DIR/extract-vmlinux" \
140+
"$DRIVER_TOOLKIT_SHARED_DIR/gpu-driver-util" \
139141
"$DRIVER_TOOLKIT_SHARED_DIR/vgpu-util" \
140142
"${DRIVER_TOOLKIT_SHARED_DIR}/bin"
141143

ubuntu20.04/Dockerfile

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,10 @@ ENV PATH /usr/local/go/bin:$PATH
2828
WORKDIR /work
2929

3030
RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
31-
cd driver/vgpu/src && \
32-
go build -o vgpu-util && \
33-
mv vgpu-util /work
31+
go build -C driver/vgpu/src -o vgpu-util && \
32+
mv driver/vgpu/src/vgpu-util /work && \
33+
go build -C driver/gpu-driver-util -o gpu-driver-util && \
34+
mv driver/gpu-driver-util/gpu-driver-util /work
3435

3536
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu20.04
3637

@@ -72,6 +73,7 @@ RUN /tmp/install.sh reposetup && /tmp/install.sh depinstall && \
7273
COPY nvidia-driver /usr/local/bin
7374

7475
COPY --from=build /work/vgpu-util /usr/local/bin
76+
COPY --from=build /work/gpu-driver-util /usr/local/bin
7577

7678
ADD drivers drivers/
7779

ubuntu20.04/nvidia-driver

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ NVIDIA_MODESET_MODULE_PARAMS=()
1616
NVIDIA_PEERMEM_MODULE_PARAMS=()
1717
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}
1818

19-
OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
20-
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
19+
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
2120

2221
export DEBIAN_FRONTEND=noninteractive
2322

@@ -477,6 +476,24 @@ _shutdown() {
477476
return 1
478477
}
479478

479+
_resolve_kernel_type() {
480+
if [ "${KERNEL_MODULE_TYPE}" == "proprietary" ]; then
481+
KERNEL_TYPE=kernel
482+
elif [ "${KERNEL_MODULE_TYPE}" == "open" ]; then
483+
KERNEL_TYPE=kernel-open
484+
elif [ "${KERNEL_MODULE_TYPE}" == "auto" ]; then
485+
KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}")
486+
if [ $? -ne 0 ]; then
487+
echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
488+
tail -n 3 /var/log/gpu-driver-util.log
489+
return 1
490+
fi
491+
else
492+
echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
493+
return 1
494+
fi
495+
}
496+
480497
_find_vgpu_driver_version() {
481498
local count=""
482499
local version=""
@@ -520,6 +537,8 @@ init() {
520537
_find_vgpu_driver_version || exit 1
521538
fi
522539

540+
_resolve_kernel_type || exit 1
541+
523542
# Install the userspace components and copy the kernel module sources.
524543
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
525544
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
@@ -592,6 +611,8 @@ update() {
592611
fi
593612
exec 3>&-
594613

614+
_resolve_kernel_type || exit 1
615+
595616
# vgpu driver version is chosen dynamically during runtime, so pre-compile modules for
596617
# only non-vgpu driver types
597618
if [ "${DRIVER_TYPE}" != "vgpu" ]; then

ubuntu22.04/Dockerfile

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,10 @@ ENV PATH /usr/local/go/bin:$PATH
2828
WORKDIR /work
2929

3030
RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
31-
cd driver/vgpu/src && \
32-
go build -o vgpu-util && \
33-
mv vgpu-util /work
31+
go build -C driver/vgpu/src -o vgpu-util && \
32+
mv driver/vgpu/src/vgpu-util /work && \
33+
go build -C driver/gpu-driver-util -o gpu-driver-util && \
34+
mv driver/gpu-driver-util/gpu-driver-util /work
3435

3536
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
3637

@@ -72,6 +73,7 @@ RUN /tmp/install.sh reposetup && /tmp/install.sh depinstall && \
7273
COPY nvidia-driver /usr/local/bin
7374

7475
COPY --from=build /work/vgpu-util /usr/local/bin
76+
COPY --from=build /work/gpu-driver-util /usr/local/bin
7577

7678
ADD drivers drivers/
7779

ubuntu22.04/nvidia-driver

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@ NVIDIA_UVM_MODULE_PARAMS=()
1515
NVIDIA_MODESET_MODULE_PARAMS=()
1616
NVIDIA_PEERMEM_MODULE_PARAMS=()
1717
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}
18-
19-
OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
20-
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
18+
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
2119

2220
export DEBIAN_FRONTEND=noninteractive
2321

@@ -481,6 +479,24 @@ _shutdown() {
481479
return 1
482480
}
483481

482+
_resolve_kernel_type() {
483+
if [ "${KERNEL_MODULE_TYPE}" == "proprietary" ]; then
484+
KERNEL_TYPE=kernel
485+
elif [ "${KERNEL_MODULE_TYPE}" == "open" ]; then
486+
KERNEL_TYPE=kernel-open
487+
elif [ "${KERNEL_MODULE_TYPE}" == "auto" ]; then
488+
KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}")
489+
if [ $? -ne 0 ]; then
490+
echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
491+
tail -n 3 /var/log/gpu-driver-util.log
492+
return 1
493+
fi
494+
else
495+
echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
496+
return 1
497+
fi
498+
}
499+
484500
_find_vgpu_driver_version() {
485501
local count=""
486502
local version=""
@@ -524,6 +540,8 @@ init() {
524540
_find_vgpu_driver_version || exit 1
525541
fi
526542

543+
_resolve_kernel_type || exit 1
544+
527545
# Install the userspace components and copy the kernel module sources.
528546
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
529547
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
@@ -596,6 +614,8 @@ update() {
596614
fi
597615
exec 3>&-
598616

617+
_resolve_kernel_type || exit 1
618+
599619
# vgpu driver version is chosen dynamically during runtime, so pre-compile modules for
600620
# only non-vgpu driver types
601621
if [ "${DRIVER_TYPE}" != "vgpu" ]; then

0 commit comments

Comments
 (0)