14 changes: 0 additions & 14 deletions CMakeLists.txt
@@ -11,17 +11,3 @@ endif()
list(APPEND CMAKE_PREFIX_PATH "${CMAKE_CURRENT_SOURCE_DIR}/src/charonload")

find_package(charonload REQUIRED GLOBAL)


set(MODIFIED_CMAKE_VARIABLES
# Set by Caffe2/public/cuda.cmake as part of TorchConfig.cmake
"CMAKE_CUDA_FLAGS"
"CMAKE_CUDA_FLAGS_DEBUG"
"CMAKE_CUDA_FLAGS_MINSIZEREL"
"CMAKE_CUDA_FLAGS_RELEASE"
"CMAKE_CUDA_FLAGS_RELWITHDEBINFO"
)

foreach(var IN LISTS MODIFIED_CMAKE_VARIABLES)
set(${var} ${${var}} PARENT_SCOPE)
endforeach()
11 changes: 11 additions & 0 deletions docs/src/pytorch_handling.md
@@ -40,6 +40,16 @@ Set required PIC flag for linking.

::::

::::{grid-item-card}
:link: pytorch_handling/cuda_flags
:link-type: doc

**CUDA Flags**
^^^
Propagate required CUDA flags to parent and sibling directories.

::::

:::::


@@ -49,4 +59,5 @@ Set required PIC flag for linking.
pytorch_handling/cpp_standard
pytorch_handling/cpp11_abi
pytorch_handling/position_independent_code
pytorch_handling/cuda_flags
```
13 changes: 13 additions & 0 deletions docs/src/pytorch_handling/cuda_flags.md
@@ -0,0 +1,13 @@
# CUDA Flags

To simplify writing CUDA kernels, the PyTorch C++ library enables several compiler flags (sketched below):

- Targeting the CUDA architectures of the detected GPUs
- Enabling `__host__ __device__` lambda functions, e.g., for Thrust/CUB algorithms
- Enabling relaxed `constexpr` rules to reuse host `constexpr` functions such as `std::clamp` in kernels and `__device__` functions
- Suppressing some noisy warnings
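
The exact set of flags depends on the PyTorch version, the platform, and the detected GPUs. As a rough illustration, the modifications amount to appending entries like the following to `CUDA_NVCC_FLAGS`:

```cmake
# Illustrative only: the concrete flags vary between PyTorch versions and GPUs.
list(APPEND CUDA_NVCC_FLAGS
    "-gencode" "arch=compute_86,code=sm_86"    # architectures of the detected GPUs
    "--expt-extended-lambda"                   # __host__ __device__ lambda functions
    "--expt-relaxed-constexpr"                 # relaxed constexpr rules in device code
    "-Xcudafe" "--diag_suppress=field_without_dll_interface"  # example of a suppressed warning
)
```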

However, the PyTorch C++ library provides these flags by modifying the (old-school) [``CUDA_NVCC_FLAGS``](<inv:cmake.org#module/FindCUDA>) variable. Although CMake will pick up the variable, the modifications are **only** visible in the directory (and subdirectory) scope(s) where PyTorch has been found by [``find_package``](<inv:cmake.org#command/find_package>). This may lead to compiler errors for dependent targets in parent or sibling directories when PyTorch is found with the ``GLOBAL`` option enabled, as this promotes **only** the respective targets to all scopes but leaves the variable modifications in the calling scope.


Charonload automatically detects the modified compile flags and attaches them as an `INTERFACE` property to the CUDA target of the PyTorch C++ library, such that they will be correctly propagated to any linking target.
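
For downstream projects, no manual flag handling should be required: a target in a parent or sibling directory simply links (directly or transitively) against the patched CUDA target and inherits the flags. A minimal sketch of such a consumer, with illustrative target and file names:

```cmake
# Hypothetical sibling/CMakeLists.txt; "my_bindings" and the source files are
# placeholders. charonload has already been found with GLOBAL elsewhere in the
# build tree, so this call short-circuits.
find_package(charonload)

if(charonload_FOUND)
    charonload_add_torch_library(my_bindings MODULE)

    # The CUDA flags travel as INTERFACE compile options of torch_cuda and reach
    # my_kernel.cu through the link dependencies set up by charonload, so this
    # directory never touches CUDA_NVCC_FLAGS itself.
    target_sources(my_bindings PRIVATE bindings.cpp my_kernel.cu)
endif()
```
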
19 changes: 18 additions & 1 deletion src/charonload/cmake/charonload-config.cmake
@@ -81,9 +81,25 @@ if(charonload_FIND_QUIETLY)
set(CUDNN_FIND_QUIETLY 1)
endif()

# Back up CUDA_NVCC_FLAGS for later restoring
set(CHARONLOAD_CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS})

find_dependency(Torch)

list(POP_BACK CMAKE_MESSAGE_INDENT)

if(Torch_FOUND)
# 1. CUDA flag patching
if(NOT CHARONLOAD_CUDA_NVCC_FLAGS STREQUAL CUDA_NVCC_FLAGS AND TARGET torch_cuda)
# Use modified CUDA_NVCC_FLAGS
target_compile_options(torch_cuda INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:${CUDA_NVCC_FLAGS}>)

# Restore CUDA_NVCC_FLAGS
set(CUDA_NVCC_FLAGS ${CHARONLOAD_CUDA_NVCC_FLAGS})
message(STATUS "Patched target \"torch_cuda\" with modified \"CUDA_NVCC_FLAGS\" settings and rolled back the variable modifications.")
endif()

# 2. Python bindings library
get_target_property(TORCH_LIBRARY_LOCATION torch LOCATION)
get_filename_component(TORCH_LIB_SEARCH_PATH ${TORCH_LIBRARY_LOCATION} DIRECTORY)

@@ -102,7 +118,8 @@ if(Torch_FOUND)
endif()
endif()

list(POP_BACK CMAKE_MESSAGE_INDENT)
# Clean up backup variable
unset(CHARONLOAD_CUDA_NVCC_FLAGS)


include("${CMAKE_CURRENT_LIST_DIR}/torch/cxx_standard.cmake")
4 changes: 4 additions & 0 deletions tests/data/torch_cuda/two_times_cuda.cu
@@ -2,6 +2,10 @@
#include <ATen/ops/zeros_like.h>
#include <c10/cuda/CUDAException.h>

#ifndef __CUDACC_EXTENDED_LAMBDA__
#error "Modified CUDA_NVCC_FLAGS (extended lambda) from torch not correctly propagated"
#endif

template <class T>
__global__ void
two_times_kernel(const T* const input, T* const output, const std::size_t N)
8 changes: 8 additions & 0 deletions tests/data/torch_cuda_subdirectory/CMakeLists.txt
@@ -0,0 +1,8 @@
cmake_minimum_required(VERSION 3.27)

project(torch_cuda_subdirectory LANGUAGES CXX CUDA)

add_subdirectory(lib_dir)

# Must come AFTER lib_dir
add_subdirectory(binding_dir)
9 changes: 9 additions & 0 deletions tests/data/torch_cuda_subdirectory/binding_dir/CMakeLists.txt
@@ -0,0 +1,9 @@
# This short-circuits due to the earlier GLOBAL find_package in lib_dir
find_package(charonload)

if(charonload_FOUND)
charonload_add_torch_library(${TORCH_EXTENSION_NAME} MODULE)

target_sources(${TORCH_EXTENSION_NAME} PRIVATE bindings.cpp three_times_cuda.cu)
target_link_libraries(${TORCH_EXTENSION_NAME} PRIVATE torch_cuda_subdirectory)
endif()
40 changes: 40 additions & 0 deletions tests/data/torch_cuda_subdirectory/binding_dir/bindings.cpp
@@ -0,0 +1,40 @@
#include <torch/python.h>

#include "three_times_cuda.h"
#include "two_times_cuda.h"

using namespace pybind11::literals;

#define STRINGIFY_IMPL(x) #x
#define STRINGIFY(a) STRINGIFY_IMPL(a)

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.doc() = "A C++/CUDA extension module named \"" STRINGIFY(TORCH_EXTENSION_NAME) "\" that is built just-in-time.";

m.def("two_times", &two_times, "input"_a, R"(
Multiply the given input tensor by a factor of 2 on the GPU using CUDA.

Parameters
----------
input
A tensor with arbitrary shape and dtype.

Returns
-------
A new tensor with the same shape and dtype as ``input`` and where each value is multiplied by 2.
)");

m.def("three_times", &three_times, "input"_a, R"(
Multiply the given input tensor by a factor of 3 on the GPU using CUDA.

Parameters
----------
input
A tensor with arbitrary shape and dtype.

Returns
-------
A new tensor with the same shape and dtype as ``input`` and where each value is multiplied by 3.
)");
}
37 changes: 37 additions & 0 deletions tests/data/torch_cuda_subdirectory/binding_dir/three_times_cuda.cu
@@ -0,0 +1,37 @@
#include <ATen/Dispatch.h>
#include <ATen/ops/zeros_like.h>
#include <c10/cuda/CUDAException.h>

#ifndef __CUDACC_EXTENDED_LAMBDA__
#error "Modified CUDA_NVCC_FLAGS (extended lambda) from torch not correctly propagated"
#endif

template <class T>
__global__ void
three_times_kernel(const T* const input, T* const output, const std::size_t N)
{
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
{
output[i] = T(3) * input[i];
}
}

at::Tensor
three_times(const at::Tensor& input)
{
auto output = at::zeros_like(input);

AT_DISPATCH_ALL_TYPES(input.scalar_type(),
"three_times_kernel",
[&]()
{
const std::uint32_t block_size = 128;
const std::uint32_t num_blocks = (input.numel() + block_size - 1) / block_size;
three_times_kernel<<<num_blocks, block_size>>>(input.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
input.numel());
C10_CUDA_KERNEL_LAUNCH_CHECK();
});

return output;
}
6 changes: 6 additions & 0 deletions tests/data/torch_cuda_subdirectory/binding_dir/three_times_cuda.h
@@ -0,0 +1,6 @@
#pragma once

#include <ATen/core/Tensor.h>

at::Tensor
three_times(const at::Tensor& input);
9 changes: 9 additions & 0 deletions tests/data/torch_cuda_subdirectory/lib_dir/CMakeLists.txt
@@ -0,0 +1,9 @@
# Use GLOBAL import to force short-circuiting
find_package(charonload GLOBAL)

if(charonload_FOUND)
charonload_add_torch_library(torch_cuda_subdirectory STATIC)

target_include_directories(torch_cuda_subdirectory PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
target_sources(torch_cuda_subdirectory PRIVATE two_times_cuda.cu)
endif()
37 changes: 37 additions & 0 deletions tests/data/torch_cuda_subdirectory/lib_dir/two_times_cuda.cu
@@ -0,0 +1,37 @@
#include <ATen/Dispatch.h>
#include <ATen/ops/zeros_like.h>
#include <c10/cuda/CUDAException.h>

#ifndef __CUDACC_EXTENDED_LAMBDA__
#error "Modified CUDA_NVCC_FLAGS (extended lambda) from torch not correctly propagated"
#endif

template <class T>
__global__ void
two_times_kernel(const T* const input, T* const output, const std::size_t N)
{
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
{
output[i] = T(2) * input[i];
}
}

at::Tensor
two_times(const at::Tensor& input)
{
auto output = at::zeros_like(input);

AT_DISPATCH_ALL_TYPES(input.scalar_type(),
"two_times_cuda",
[&]()
{
const std::uint32_t block_size = 128;
const std::uint32_t num_blocks = (input.numel() + block_size - 1) / block_size;
two_times_kernel<<<num_blocks, block_size>>>(input.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
input.numel());
C10_CUDA_KERNEL_LAUNCH_CHECK();
});

return output;
}
6 changes: 6 additions & 0 deletions tests/data/torch_cuda_subdirectory/lib_dir/two_times_cuda.h
@@ -0,0 +1,6 @@
#pragma once

#include <ATen/core/Tensor.h>

at::Tensor
two_times(const at::Tensor& input);
28 changes: 28 additions & 0 deletions tests/test_finder.py
@@ -290,6 +290,34 @@ def test_torch_subdirectory(shared_datadir: pathlib.Path, tmp_path: pathlib.Path) -> None:
assert torch.equal(t_output, 2 * t_input)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
def test_torch_cuda_subdirectory(shared_datadir: pathlib.Path, tmp_path: pathlib.Path) -> None:
project_directory = shared_datadir / "torch_cuda_subdirectory"
build_directory = tmp_path / "build"

charonload.module_config["test_torch_cuda_subdirectory"] = charonload.Config(
project_directory,
build_directory,
stubs_directory=VSCODE_STUBS_DIRECTORY,
)

import test_torch_cuda_subdirectory as test_torch

t_input = torch.randint(0, 10, size=(3, 3, 3), dtype=torch.float, device="cuda")
t_output = test_torch.two_times(t_input)

assert t_output.device == t_input.device
assert t_output.shape == t_input.shape
assert torch.equal(t_output, 2 * t_input)

t_input = torch.randint(0, 10, size=(3, 3, 3), dtype=torch.float, device="cuda")
t_output = test_torch.three_times(t_input)

assert t_output.device == t_input.device
assert t_output.shape == t_input.shape
assert torch.equal(t_output, 3 * t_input)


def test_torch_import_twice(shared_datadir: pathlib.Path, tmp_path: pathlib.Path) -> None:
project_directory = shared_datadir / "torch_cpu"
build_directory = tmp_path / "build"