Commit 5d5a4c3

Propagate PyTorch CUDA flags via CMake target
1 parent c3b3825 commit 5d5a4c3

14 files changed: +226 -15 lines changed

CMakeLists.txt

Lines changed: 0 additions & 14 deletions
@@ -11,17 +11,3 @@ endif()
 list(APPEND CMAKE_PREFIX_PATH "${CMAKE_CURRENT_SOURCE_DIR}/src/charonload")
 
 find_package(charonload REQUIRED GLOBAL)
-
-
-set(MODIFIED_CMAKE_VARIABLES
-    # Set by Caffe2/public/cuda.cmake as part of TorchConfig.cmake
-    "CMAKE_CUDA_FLAGS"
-    "CMAKE_CUDA_FLAGS_DEBUG"
-    "CMAKE_CUDA_FLAGS_MINSIZEREL"
-    "CMAKE_CUDA_FLAGS_RELEASE"
-    "CMAKE_CUDA_FLAGS_RELWITHDEBINFO"
-)
-
-foreach(var IN LISTS MODIFIED_CMAKE_VARIABLES)
-    set(${var} ${${var}} PARENT_SCOPE)
-endforeach()

docs/src/pytorch_handling.md

Lines changed: 11 additions & 0 deletions
@@ -40,6 +40,16 @@ Set required PIC flag for linking.
 
 ::::
 
+::::{grid-item-card}
+:link: pytorch_handling/cuda_flags
+:link-type: doc
+
+**CUDA Flags**
+^^^
+Set required CUDA flags for parents and siblings.
+
+::::
+
 :::::
 
 
@@ -49,4 +59,5 @@
 pytorch_handling/cpp_standard
 pytorch_handling/cpp11_abi
 pytorch_handling/position_independent_code
+pytorch_handling/cuda_flags
 ```
docs/src/pytorch_handling/cuda_flags.md

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# CUDA Flags
+
+In order to simplify writing CUDA kernels, the PyTorch C++ library enables several compiler flags:
+
+- Using the CUDA architectures of the detected GPUs
+- Enabling `__host__ __device__` lambda functions, e.g., with Thrust/CUB algorithms
+- Enabling relaxed `constexpr` rules to reuse, e.g., `std::clamp` in kernels and `__device__` functions
+- Suppressing some noisy warnings
+
+However, the PyTorch C++ library provides these flags by modifying the (old-school) [``CUDA_NVCC_FLAGS``](<inv:cmake.org#module/FindCUDA>) variable. Although CMake will pick up this variable, the modifications are **only** visible in the directory (and subdirectory) scope(s) where PyTorch has been found by [``find_package``](<inv:cmake.org#command/find_package>). This may lead to compiler errors for dependent targets in parent or sibling directories when finding PyTorch with the ``GLOBAL`` option enabled, as this promotes **only** the respective targets to all scopes but leaves the variable modifications in the calling scope.
+
+
+Charonload automatically detects the modified compile flags and attaches them as an `INTERFACE` property to the CUDA target of the PyTorch C++ library, so that they are correctly propagated to any linking target.
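
Editor's note: as a rough illustration of the flag list in this new doc page, the first three bullets map onto standard `nvcc` options. A minimal sketch in modern per-target CMake, where `my_cuda_target` is a placeholder and the exact flag set PyTorch appends varies between versions:

```cmake
# Sketch only: roughly the nvcc options behind the first three bullets,
# expressed as per-target CMake settings. Warning suppressions are omitted
# since the suppressed diagnostics differ between PyTorch versions.
set_target_properties(my_cuda_target PROPERTIES CUDA_ARCHITECTURES native)  # detected GPUs (CMake >= 3.24)
target_compile_options(my_cuda_target PRIVATE
    $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda;--expt-relaxed-constexpr>)
```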
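The scoping pitfall described above can be made concrete with a minimal sketch of a top-level CMakeLists.txt, using the lib_dir/binding_dir layout that the test case added in this commit also uses:

```cmake
# Minimal sketch of the problematic layout:
#
#   lib_dir/      finds charonload/PyTorch via find_package(... GLOBAL);
#                 CUDA_NVCC_FLAGS is modified only in this directory scope
#   binding_dir/  sibling directory compiling CUDA sources against torch_cuda
#
# GLOBAL promotes imported targets such as torch_cuda to every scope, but the
# variable modifications stay behind in lib_dir/, so without charonload's
# patching, CUDA sources in binding_dir/ would be compiled without, e.g.,
# --extended-lambda and fail.
add_subdirectory(lib_dir)
add_subdirectory(binding_dir)  # must come AFTER lib_dir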
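From a consumer's perspective, the fix means plain linking suffices. A minimal sketch, where the target name `my_kernels` is hypothetical and a CUDA-enabled libtorch is assumed so that the `torch` target links `torch_cuda`:

```cmake
# Sketch: with the flags attached to torch_cuda as INTERFACE compile options,
# they travel along the link graph as usage requirements, regardless of the
# directory scope this code lives in.
add_library(my_kernels STATIC my_kernels.cu)
target_link_libraries(my_kernels PRIVATE torch)  # pulls in torch_cuda and its flags
```

Attaching the flags as usage requirements of the imported target is what makes the behavior scope-independent: CMake resolves `INTERFACE` compile options at the target level, not the directory level.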

src/charonload/cmake/charonload-config.cmake

Lines changed: 18 additions & 1 deletion
@@ -81,9 +81,25 @@ if(charonload_FIND_QUIETLY)
     set(CUDNN_FIND_QUIETLY 1)
 endif()
 
+# Back up CUDA_NVCC_FLAGS for later restoring
+set(CHARONLOAD_CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS})
+
 find_dependency(Torch)
 
+list(POP_BACK CMAKE_MESSAGE_INDENT)
+
 if(Torch_FOUND)
+    # 1. CUDA flag patching
+    if(NOT CHARONLOAD_CUDA_NVCC_FLAGS STREQUAL CUDA_NVCC_FLAGS AND TARGET torch_cuda)
+        # Use modified CUDA_NVCC_FLAGS
+        target_compile_options(torch_cuda INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:${CUDA_NVCC_FLAGS}>)
+
+        # Restore CUDA_NVCC_FLAGS
+        set(CUDA_NVCC_FLAGS ${CHARONLOAD_CUDA_NVCC_FLAGS})
+        message(STATUS "Patched target \"torch_cuda\" with modified \"CUDA_NVCC_FLAGS\" settings and rolled back the variable modifications.")
+    endif()
+
+    # 2. Python bindings library
     get_target_property(TORCH_LIBRARY_LOCATION torch LOCATION)
     get_filename_component(TORCH_LIB_SEARCH_PATH ${TORCH_LIBRARY_LOCATION} DIRECTORY)
 
@@ -102,7 +118,8 @@
     endif()
 endif()
 
-list(POP_BACK CMAKE_MESSAGE_INDENT)
+# Clean up backup variable
+unset(CHARONLOAD_CUDA_NVCC_FLAGS)
 
 
 include("${CMAKE_CURRENT_LIST_DIR}/torch/cxx_standard.cmake")

tests/data/torch_cuda/two_times_cuda.cu

Lines changed: 4 additions & 0 deletions
@@ -2,6 +2,10 @@
 #include <ATen/ops/zeros_like.h>
 #include <c10/cuda/CUDAException.h>
 
+#ifndef __CUDACC_EXTENDED_LAMBDA__
+    #error "Modified CUDA_NVCC_FLAGS (extended lambda) from torch not correctly propagated"
+#endif
+
 template <class T>
 __global__ void
 two_times_kernel(const T* const input, T* const output, const std::size_t N)
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.27)
+
+project(torch_cuda_subdirectory LANGUAGES CXX CUDA)
+
+add_subdirectory(lib_dir)
+
+# Must come AFTER lib_dir
+add_subdirectory(binding_dir)
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# This short-circuits from GLOBAL find_package
+find_package(charonload)
+
+if(charonload_FOUND)
+    charonload_add_torch_library(${TORCH_EXTENSION_NAME} MODULE)
+
+    target_sources(${TORCH_EXTENSION_NAME} PRIVATE bindings.cpp three_times_cuda.cu)
+    target_link_libraries(${TORCH_EXTENSION_NAME} PRIVATE torch_cuda_subdirectory)
+endif()
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+#include <torch/python.h>
+
+#include "three_times_cuda.h"
+#include "two_times_cuda.h"
+
+using namespace pybind11::literals;
+
+#define STRINGIFY_IMPL(x) #x
+#define STRINGIFY(a) STRINGIFY_IMPL(a)
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.doc() = "A C++/CUDA extension module named \"" STRINGIFY(TORCH_EXTENSION_NAME) "\" that is built just-in-time.";
+
+    m.def("two_times", &two_times, "input"_a, R"(
+        Multiply the given input tensor by a factor of 2 on the GPU using CUDA.
+
+        Parameters
+        ----------
+        input
+            A tensor with arbitrary shape and dtype.
+
+        Returns
+        -------
+        A new tensor with the same shape and dtype as ``input`` and where each value is multiplied by 2.
+    )");
+
+    m.def("three_times", &three_times, "input"_a, R"(
+        Multiply the given input tensor by a factor of 3 on the GPU using CUDA.
+
+        Parameters
+        ----------
+        input
+            A tensor with arbitrary shape and dtype.
+
+        Returns
+        -------
+        A new tensor with the same shape and dtype as ``input`` and where each value is multiplied by 3.
+    )");
+}
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+#include <ATen/Dispatch.h>
+#include <ATen/ops/zeros_like.h>
+#include <c10/cuda/CUDAException.h>
+
+#ifndef __CUDACC_EXTENDED_LAMBDA__
+    #error "Modified CUDA_NVCC_FLAGS (extended lambda) from torch not correctly propagated"
+#endif
+
+template <class T>
+__global__ void
+three_times_kernel(const T* const input, T* const output, const std::size_t N)
+{
+    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
+    {
+        output[i] = T(3) * input[i];
+    }
+}
+
+at::Tensor
+three_times(const at::Tensor& input)
+{
+    auto output = at::zeros_like(input);
+
+    AT_DISPATCH_ALL_TYPES(input.scalar_type(),
+                          "three_times_kernel",
+                          [&]()
+                          {
+                              const std::uint32_t block_size = 128;
+                              const std::uint32_t num_blocks = (input.numel() + block_size - 1) / block_size;
+                              three_times_kernel<<<num_blocks, block_size>>>(input.data_ptr<scalar_t>(),
+                                                                             output.data_ptr<scalar_t>(),
+                                                                             input.numel());
+                              C10_CUDA_KERNEL_LAUNCH_CHECK();
+                          });
+
+    return output;
+}
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+
+at::Tensor
+three_times(const at::Tensor& input);
