diff --git a/CMakeLists.txt b/CMakeLists.txt
index 64e6820..c957f7d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,17 +11,3 @@ endif()
 
 list(APPEND CMAKE_PREFIX_PATH "${CMAKE_CURRENT_SOURCE_DIR}/src/charonload")
 find_package(charonload REQUIRED GLOBAL)
-
-
-set(MODIFIED_CMAKE_VARIABLES
-    # Set by Caffe2/public/cuda.cmake as part of TorchConfig.cmake
-    "CMAKE_CUDA_FLAGS"
-    "CMAKE_CUDA_FLAGS_DEBUG"
-    "CMAKE_CUDA_FLAGS_MINSIZEREL"
-    "CMAKE_CUDA_FLAGS_RELEASE"
-    "CMAKE_CUDA_FLAGS_RELWITHDEBINFO"
-)
-
-foreach(var IN LISTS MODIFIED_CMAKE_VARIABLES)
-    set(${var} ${${var}} PARENT_SCOPE)
-endforeach()
diff --git a/docs/src/pytorch_handling.md b/docs/src/pytorch_handling.md
index dc0c1e0..7c41ccb 100644
--- a/docs/src/pytorch_handling.md
+++ b/docs/src/pytorch_handling.md
@@ -40,6 +40,16 @@ Set required PIC flag for linking.
 
 ::::
 
+::::{grid-item-card}
+:link: pytorch_handling/cuda_flags
+:link-type: doc
+
+**CUDA Flags**
+^^^
+Set required CUDA flags for parents and siblings.
+
+::::
+
 :::::
 
 
@@ -49,4 +59,5 @@ Set required PIC flag for linking.
 pytorch_handling/cpp_standard
 pytorch_handling/cpp11_abi
 pytorch_handling/position_independent_code
+pytorch_handling/cuda_flags
 ```
diff --git a/docs/src/pytorch_handling/cuda_flags.md b/docs/src/pytorch_handling/cuda_flags.md
new file mode 100644
index 0000000..96cf214
--- /dev/null
+++ b/docs/src/pytorch_handling/cuda_flags.md
@@ -0,0 +1,13 @@
+# CUDA Flags
+
+To simplify writing CUDA kernels, the PyTorch C++ library enables several compiler flags:
+
+- Targeting the CUDA architectures of the detected GPUs
+- Enabling `__host__ __device__` lambda functions, e.g., for use with Thrust/CUB algorithms
+- Enabling relaxed `constexpr` rules to reuse functions such as `std::clamp` in kernels and `__device__` functions
+- Suppressing some noisy warnings
+
+However, the PyTorch C++ library provides these flags by modifying the (old-school) [``CUDA_NVCC_FLAGS``]() variable. Although CMake will pick up this variable, the modifications are **only** visible in the directory (and subdirectory) scope(s) where PyTorch has been found by [``find_package``](). This may lead to compiler errors for dependent targets in parent or sibling directories when PyTorch is found with the ``GLOBAL`` option enabled, as this promotes **only** the respective targets to all scopes but leaves the variable modifications in the calling scope.
+
+
+Charonload automatically detects the modified compile flags and attaches them as an `INTERFACE` property to the CUDA target of the PyTorch C++ library, so that they are correctly propagated to any target that links against it.
diff --git a/src/charonload/cmake/charonload-config.cmake b/src/charonload/cmake/charonload-config.cmake
index 7b074d5..c7ff7d8 100644
--- a/src/charonload/cmake/charonload-config.cmake
+++ b/src/charonload/cmake/charonload-config.cmake
@@ -81,9 +81,25 @@ if(charonload_FIND_QUIETLY)
     set(CUDNN_FIND_QUIETLY 1)
 endif()
 
+# Back up CUDA_NVCC_FLAGS for later restoring
+set(CHARONLOAD_CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS})
+
 find_dependency(Torch)
 
+list(POP_BACK CMAKE_MESSAGE_INDENT)
+
 if(Torch_FOUND)
+    # 1. CUDA flag patching
+    if(NOT CHARONLOAD_CUDA_NVCC_FLAGS STREQUAL CUDA_NVCC_FLAGS AND TARGET torch_cuda)
+        # Use modified CUDA_NVCC_FLAGS
+        target_compile_options(torch_cuda INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:${CUDA_NVCC_FLAGS}>)
+
+        # Restore CUDA_NVCC_FLAGS
+        set(CUDA_NVCC_FLAGS ${CHARONLOAD_CUDA_NVCC_FLAGS})
+        message(STATUS "Patched target \"torch_cuda\" with modified \"CUDA_NVCC_FLAGS\" settings and rolled back the variable modifications.")
+    endif()
+
+    # 2. Python bindings library
     get_target_property(TORCH_LIBRARY_LOCATION torch LOCATION)
     get_filename_component(TORCH_LIB_SEARCH_PATH ${TORCH_LIBRARY_LOCATION} DIRECTORY)
 
@@ -102,7 +118,8 @@ if(Torch_FOUND)
     endif()
 endif()
 
-list(POP_BACK CMAKE_MESSAGE_INDENT)
+# Clean up backup variable
+unset(CHARONLOAD_CUDA_NVCC_FLAGS)
 
 include("${CMAKE_CURRENT_LIST_DIR}/torch/cxx_standard.cmake")
 
diff --git a/tests/data/torch_cuda/two_times_cuda.cu b/tests/data/torch_cuda/two_times_cuda.cu
index 2d05adb..7a01bf4 100644
--- a/tests/data/torch_cuda/two_times_cuda.cu
+++ b/tests/data/torch_cuda/two_times_cuda.cu
@@ -2,6 +2,10 @@
 #include <c10/cuda/CUDAException.h>
 #include <cstdint>
 
+#ifndef __CUDACC_EXTENDED_LAMBDA__
+    #error "Modified CUDA_NVCC_FLAGS (extended lambda) from torch not correctly propagated"
+#endif
+
 template <typename T>
 __global__ void
 two_times_kernel(const T* const input, T* const output, const std::size_t N)
diff --git a/tests/data/torch_cuda_subdirectory/CMakeLists.txt b/tests/data/torch_cuda_subdirectory/CMakeLists.txt
new file mode 100644
index 0000000..39315e6
--- /dev/null
+++ b/tests/data/torch_cuda_subdirectory/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.27)
+
+project(torch_cuda_subdirectory LANGUAGES CXX CUDA)
+
+add_subdirectory(lib_dir)
+
+# Must come AFTER lib_dir
+add_subdirectory(binding_dir)
diff --git a/tests/data/torch_cuda_subdirectory/binding_dir/CMakeLists.txt b/tests/data/torch_cuda_subdirectory/binding_dir/CMakeLists.txt
new file mode 100644
index 0000000..aaff444
--- /dev/null
+++ b/tests/data/torch_cuda_subdirectory/binding_dir/CMakeLists.txt
@@ -0,0 +1,9 @@
+# This short-circuits due to the GLOBAL find_package in lib_dir
+find_package(charonload)
+
+if(charonload_FOUND)
+    charonload_add_torch_library(${TORCH_EXTENSION_NAME} MODULE)
+
+    target_sources(${TORCH_EXTENSION_NAME} PRIVATE bindings.cpp three_times_cuda.cu)
+    target_link_libraries(${TORCH_EXTENSION_NAME} PRIVATE torch_cuda_subdirectory)
+endif()
diff --git a/tests/data/torch_cuda_subdirectory/binding_dir/bindings.cpp b/tests/data/torch_cuda_subdirectory/binding_dir/bindings.cpp
new file mode 100644
index 0000000..48c199f
--- /dev/null
+++ b/tests/data/torch_cuda_subdirectory/binding_dir/bindings.cpp
@@ -0,0 +1,40 @@
+#include <torch/extension.h>
+
+#include "three_times_cuda.h"
+#include "two_times_cuda.h"
+
+using namespace pybind11::literals;
+
+#define STRINGIFY_IMPL(x) #x
+#define STRINGIFY(a) STRINGIFY_IMPL(a)
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.doc() = "A C++/CUDA extension module named \"" STRINGIFY(TORCH_EXTENSION_NAME) "\" that is built just-in-time.";
+
+    m.def("two_times", &two_times, "input"_a, R"(
+        Multiply the given input tensor by a factor of 2 on the GPU using CUDA.
+
+        Parameters
+        ----------
+        input
+            A tensor with arbitrary shape and dtype.
+
+        Returns
+        -------
+        A new tensor with the same shape and dtype as ``input`` where each value is multiplied by 2.
+    )");
+
+    m.def("three_times", &three_times, "input"_a, R"(
+        Multiply the given input tensor by a factor of 3 on the GPU using CUDA.
+
+        Parameters
+        ----------
+        input
+            A tensor with arbitrary shape and dtype.
+
+        Returns
+        -------
+        A new tensor with the same shape and dtype as ``input`` where each value is multiplied by 3.
+    )");
+}
diff --git a/tests/data/torch_cuda_subdirectory/binding_dir/three_times_cuda.cu b/tests/data/torch_cuda_subdirectory/binding_dir/three_times_cuda.cu
new file mode 100644
index 0000000..c9dad79
--- /dev/null
+++ b/tests/data/torch_cuda_subdirectory/binding_dir/three_times_cuda.cu
@@ -0,0 +1,37 @@
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAException.h>
+#include <cstdint>
+
+#ifndef __CUDACC_EXTENDED_LAMBDA__
+    #error "Modified CUDA_NVCC_FLAGS (extended lambda) from torch not correctly propagated"
+#endif
+
+template <typename T>
+__global__ void
+three_times_kernel(const T* const input, T* const output, const std::size_t N)
+{
+    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
+    {
+        output[i] = T(3) * input[i];
+    }
+}
+
+at::Tensor
+three_times(const at::Tensor& input)
+{
+    auto output = at::zeros_like(input);
+
+    AT_DISPATCH_ALL_TYPES(input.scalar_type(),
+                          "three_times_kernel",
+                          [&]()
+                          {
+                              const std::uint32_t block_size = 128;
+                              const std::uint32_t num_blocks = (input.numel() + block_size - 1) / block_size;
+                              three_times_kernel<<<num_blocks, block_size>>>(input.data_ptr<scalar_t>(),
+                                                                             output.data_ptr<scalar_t>(),
+                                                                             input.numel());
+                              C10_CUDA_KERNEL_LAUNCH_CHECK();
+                          });
+
+    return output;
+}
diff --git a/tests/data/torch_cuda_subdirectory/binding_dir/three_times_cuda.h b/tests/data/torch_cuda_subdirectory/binding_dir/three_times_cuda.h
new file mode 100644
index 0000000..410376d
--- /dev/null
+++ b/tests/data/torch_cuda_subdirectory/binding_dir/three_times_cuda.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <ATen/ATen.h>
+
+at::Tensor
+three_times(const at::Tensor& input);
diff --git a/tests/data/torch_cuda_subdirectory/lib_dir/CMakeLists.txt b/tests/data/torch_cuda_subdirectory/lib_dir/CMakeLists.txt
new file mode 100644
index 0000000..9abbb59
--- /dev/null
+++ b/tests/data/torch_cuda_subdirectory/lib_dir/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Use GLOBAL import to force short-circuiting
+find_package(charonload GLOBAL)
+
+if(charonload_FOUND)
+    charonload_add_torch_library(torch_cuda_subdirectory STATIC)
+
+    target_include_directories(torch_cuda_subdirectory PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
+    target_sources(torch_cuda_subdirectory PRIVATE two_times_cuda.cu)
+endif()
diff --git a/tests/data/torch_cuda_subdirectory/lib_dir/two_times_cuda.cu b/tests/data/torch_cuda_subdirectory/lib_dir/two_times_cuda.cu
new file mode 100644
index 0000000..7a01bf4
--- /dev/null
+++ b/tests/data/torch_cuda_subdirectory/lib_dir/two_times_cuda.cu
@@ -0,0 +1,37 @@
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAException.h>
+#include <cstdint>
+
+#ifndef __CUDACC_EXTENDED_LAMBDA__
+    #error "Modified CUDA_NVCC_FLAGS (extended lambda) from torch not correctly propagated"
+#endif
+
+template <typename T>
+__global__ void
+two_times_kernel(const T* const input, T* const output, const std::size_t N)
+{
+    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
+    {
+        output[i] = T(2) * input[i];
+    }
+}
+
+at::Tensor
+two_times(const at::Tensor& input)
+{
+    auto output = at::zeros_like(input);
+
+    AT_DISPATCH_ALL_TYPES(input.scalar_type(),
+                          "two_times_cuda",
+                          [&]()
+                          {
+                              const std::uint32_t block_size = 128;
+                              const std::uint32_t num_blocks = (input.numel() + block_size - 1) / block_size;
+                              two_times_kernel<<<num_blocks, block_size>>>(input.data_ptr<scalar_t>(),
+                                                                           output.data_ptr<scalar_t>(),
+                                                                           input.numel());
+                              C10_CUDA_KERNEL_LAUNCH_CHECK();
+                          });
+
+    return output;
+}
diff --git a/tests/data/torch_cuda_subdirectory/lib_dir/two_times_cuda.h b/tests/data/torch_cuda_subdirectory/lib_dir/two_times_cuda.h
new file mode 100644
index 0000000..65d5b6b
--- /dev/null
+++ b/tests/data/torch_cuda_subdirectory/lib_dir/two_times_cuda.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <ATen/ATen.h>
+
+at::Tensor
+two_times(const at::Tensor& input);
diff --git a/tests/test_finder.py b/tests/test_finder.py
index c7f0b75..95d3dca 100644
--- a/tests/test_finder.py
+++ b/tests/test_finder.py
@@ -290,6 +290,34 @@ def test_torch_subdirectory(shared_datadir: pathlib.Path, tmp_path: pathlib.Path
     assert torch.equal(t_output, 2 * t_input)
 
 
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
+def test_torch_cuda_subdirectory(shared_datadir: pathlib.Path, tmp_path: pathlib.Path) -> None:
+    project_directory = shared_datadir / "torch_cuda_subdirectory"
+    build_directory = tmp_path / "build"
+
+    charonload.module_config["test_torch_cuda_subdirectory"] = charonload.Config(
+        project_directory,
+        build_directory,
+        stubs_directory=VSCODE_STUBS_DIRECTORY,
+    )
+
+    import test_torch_cuda_subdirectory as test_torch
+
+    t_input = torch.randint(0, 10, size=(3, 3, 3), dtype=torch.float, device="cuda")
+    t_output = test_torch.two_times(t_input)
+
+    assert t_output.device == t_input.device
+    assert t_output.shape == t_input.shape
+    assert torch.equal(t_output, 2 * t_input)
+
+    t_input = torch.randint(0, 10, size=(3, 3, 3), dtype=torch.float, device="cuda")
+    t_output = test_torch.three_times(t_input)
+
+    assert t_output.device == t_input.device
+    assert t_output.shape == t_input.shape
+    assert torch.equal(t_output, 3 * t_input)
+
+
 def test_torch_import_twice(shared_datadir: pathlib.Path, tmp_path: pathlib.Path) -> None:
     project_directory = shared_datadir / "torch_cpu"
     build_directory = tmp_path / "build"
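
For context, the scope issue described in `docs/src/pytorch_handling/cuda_flags.md` can be sketched with a minimal, hypothetical project layout (illustration only, not part of this diff): one subdirectory finds PyTorch via charonload with `GLOBAL`, while a sibling directory merely links against the imported targets. Before this change, the sibling's `.cu` files were compiled without the Torch-provided `CUDA_NVCC_FLAGS` (e.g. the extended-lambda flag), because those flags lived only in the finding directory's scope; with the flags attached as `INTERFACE` compile options on `torch_cuda`, linking is sufficient. All names below (`example`, `finder_dir`, `sibling_dir`, `sibling_kernels`, `kernels.cu`) are made up for illustration.

```cmake
# Hypothetical top-level CMakeLists.txt (sketch, not part of this diff)
cmake_minimum_required(VERSION 3.27)
project(example LANGUAGES CXX CUDA)

add_subdirectory(finder_dir)   # calls find_package(charonload GLOBAL), which finds Torch
add_subdirectory(sibling_dir)  # never calls find_package itself

# Hypothetical sibling_dir/CMakeLists.txt (sketch)
# add_library(sibling_kernels STATIC kernels.cu)
# target_link_libraries(sibling_kernels PRIVATE torch)  # CUDA flags now travel with torch_cuda
```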