Skip to content

Commit 31ba49c

Browse files
authored
Implement CUDA backend for parallel cuda::std::for_each (#5610)
* Refactor our execution policies: We currently tag our execution policies with just an enumeration that represents the standard execution policies. However, this is not sufficient for our use cases, because we also want to pass along the execution backend and the memory_direction of the algorithm. This changes our policies so that they take an unsigned integer instead of an enumeration and then adds facilities to set and get the respective properties. * Implement parallel `cuda::std::for_each` * Do not consider `_Backend` when comparing execution policies * Only have one source of truth for the backend extraction
1 parent 1f5903d commit 31ba49c

File tree

20 files changed

+916
-80
lines changed

20 files changed

+916
-80
lines changed

cub/cub/device/dispatch/dispatch_streaming_reduce.cuh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include <cub/device/dispatch/dispatch_reduce.cuh>
1717
#include <cub/iterator/arg_index_input_iterator.cuh>
1818

19-
#include <thrust/iterator/constant_iterator.h>
2019
#include <thrust/iterator/iterator_adaptor.h>
2120

2221
#include <cuda/__iterator/tabulate_output_iterator.h>

cudax/include/cuda/experimental/__execution/policy.cuh

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -44,48 +44,48 @@ struct any_execution_policy
4444

4545
_CCCL_HIDE_FROM_ABI any_execution_policy() = default;
4646

47-
template <__execution_policy _Policy>
48-
_CCCL_HOST_API constexpr any_execution_policy(::cuda::std::execution::__policy<_Policy>) noexcept
49-
: value(_Policy)
47+
template <uint32_t _Policy>
48+
_CCCL_HOST_API constexpr any_execution_policy(::cuda::std::execution::__execution_policy_base<_Policy>) noexcept
49+
: value(value_type{_Policy})
5050
{}
5151

5252
_CCCL_HOST_API constexpr operator __execution_policy() const noexcept
5353
{
5454
return value;
5555
}
5656

57-
_CCCL_HOST_API constexpr auto operator()() const noexcept -> __execution_policy
57+
_CCCL_HOST_API constexpr auto operator()() const noexcept -> value_type
5858
{
5959
return value;
6060
}
6161

62-
template <__execution_policy _Policy>
62+
template <uint32_t _Policy>
6363
[[nodiscard]] _CCCL_HOST_API friend constexpr bool
64-
operator==(const any_execution_policy& pol, const ::cuda::std::execution::__policy<_Policy>&) noexcept
64+
operator==(const any_execution_policy& pol, const ::cuda::std::execution::__execution_policy_base<_Policy>&) noexcept
6565
{
66-
return pol.value == _Policy;
66+
return pol.value == value_type{_Policy};
6767
}
6868

6969
#if _CCCL_STD_VER <= 2017
70-
template <__execution_policy _Policy>
70+
template <uint32_t _Policy>
7171
[[nodiscard]] _CCCL_HOST_API friend constexpr bool
72-
operator==(const ::cuda::std::execution::__policy<_Policy>&, const any_execution_policy& pol) noexcept
72+
operator==(const ::cuda::std::execution::__execution_policy_base<_Policy>&, const any_execution_policy& pol) noexcept
7373
{
74-
return pol.value == _Policy;
74+
return pol.value == value_type{_Policy};
7575
}
7676

77-
template <__execution_policy _Policy>
77+
template <uint32_t _Policy>
7878
[[nodiscard]] _CCCL_HOST_API friend constexpr bool
79-
operator!=(const any_execution_policy& pol, const ::cuda::std::execution::__policy<_Policy>&) noexcept
79+
operator!=(const any_execution_policy& pol, const ::cuda::std::execution::__execution_policy_base<_Policy>&) noexcept
8080
{
81-
return pol.value != _Policy;
81+
return pol.value != value_type{_Policy};
8282
}
8383

84-
template <__execution_policy _Policy>
84+
template <uint32_t _Policy>
8585
[[nodiscard]] _CCCL_HOST_API friend constexpr bool
86-
operator!=(const ::cuda::std::execution::__policy<_Policy>&, const any_execution_policy& pol)
86+
operator!=(const ::cuda::std::execution::__execution_policy_base<_Policy>&, const any_execution_policy& pol)
8787
{
88-
return pol.value != _Policy;
88+
return pol.value != value_type{_Policy};
8989
}
9090
#endif // _CCCL_STD_VER <= 2017
9191

libcudacxx/cmake/LibcudacxxBuildCompilerTargets.cmake

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,5 +53,7 @@ function(libcudacxx_build_compiler_targets)
5353
# order matters here, we need the libcudacxx options to override the cccl options.
5454
cccl.compiler_interface
5555
libcudacxx.compiler_flags
56+
Thrust::Thrust
57+
CUB::CUB
5658
)
5759
endfunction()
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
7+
//
8+
//===----------------------------------------------------------------------===//
9+
10+
#ifndef _CUDA___EXECUTION_POLICY_H
11+
#define _CUDA___EXECUTION_POLICY_H
12+
13+
#include <cuda/std/detail/__config>
14+
15+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
16+
# pragma GCC system_header
17+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
18+
# pragma clang system_header
19+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
20+
# pragma system_header
21+
#endif // no system header
22+
23+
#if _CCCL_HAS_BACKEND_CUDA()
24+
25+
# include <cuda/__fwd/execution_policy.h>
26+
# include <cuda/std/__execution/policy.h>
27+
# include <cuda/std/__type_traits/is_execution_policy.h>
28+
29+
# include <cuda/std/__cccl/prologue.h>
30+
31+
_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
32+
33+
//! @brief Partial specialization of the execution policy base for the CUDA backend.
//! It declares no members of its own and derives from the `__none`-backend base for the same `_Policy` value.
template <uint32_t _Policy>
34+
struct _CCCL_DECLSPEC_EMPTY_BASES __execution_policy_base<_Policy, __execution_backend::__cuda>
35+
: __execution_policy_base<_Policy, __execution_backend::__none>
36+
{};
37+
38+
_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
39+
40+
_CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION
41+
42+
//! @brief Policy type built from `__execution_policy::__parallel_unsequenced` after passing its value through
//! `__with_cuda_backend`, i.e. with the backend field of the policy value set to CUDA.
using __cub_parallel_unsequenced_policy =
43+
::cuda::std::execution::__execution_policy_base<::cuda::std::execution::__with_cuda_backend<static_cast<uint32_t>(
44+
::cuda::std::execution::__execution_policy::__parallel_unsequenced)>()>;
45+
//! @brief Global instance of `__cub_parallel_unsequenced_policy`.
_CCCL_GLOBAL_CONSTANT __cub_parallel_unsequenced_policy __cub_par_unseq{};
46+
47+
_CCCL_END_NAMESPACE_CUDA_EXECUTION
48+
49+
# include <cuda/std/__cccl/epilogue.h>
50+
51+
#endif // _CCCL_HAS_BACKEND_CUDA()
52+
53+
#endif // _CUDA___EXECUTION_POLICY_H
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
7+
//
8+
//===----------------------------------------------------------------------===//
9+
10+
#ifndef _CUDA___FWD_EXECUTION_POLICY_H
11+
#define _CUDA___FWD_EXECUTION_POLICY_H
12+
13+
#include <cuda/std/detail/__config>
14+
15+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
16+
# pragma GCC system_header
17+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
18+
# pragma clang system_header
19+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
20+
# pragma system_header
21+
#endif // no system header
22+
23+
#if _CCCL_HAS_BACKEND_CUDA()
24+
25+
# include <cuda/std/__fwd/execution_policy.h>
26+
27+
# include <cuda/std/__cccl/prologue.h>
28+
29+
_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
30+
31+
//! @brief Sets the execution backend to cuda
32+
template <uint32_t _Policy>
33+
[[nodiscard]] _CCCL_API constexpr uint32_t __with_cuda_backend() noexcept
34+
{
35+
constexpr uint32_t __backend_mask{0xFFFF00FF};
36+
constexpr uint32_t __new_policy =
37+
(_Policy & __backend_mask) | (static_cast<uint32_t>(__execution_backend::__cuda) << 8);
38+
return __new_policy;
39+
}
40+
41+
_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
42+
43+
# include <cuda/std/__cccl/epilogue.h>
44+
45+
#endif // _CCCL_HAS_BACKEND_CUDA()
46+
47+
#endif // _CUDA___FWD_EXECUTION_POLICY_H

libcudacxx/include/cuda/std/__execution/policy.h

Lines changed: 35 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,65 +20,67 @@
2020
# pragma system_header
2121
#endif // no system header
2222

23-
#include <cuda/std/__type_traits/underlying_type.h>
23+
#include <cuda/std/__bit/has_single_bit.h>
24+
#include <cuda/std/__fwd/execution_policy.h>
2425
#include <cuda/std/cstdint>
2526

2627
#include <cuda/std/__cccl/prologue.h>
2728

2829
_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
2930

30-
enum class __execution_policy : uint32_t
31+
[[nodiscard]] _CCCL_API constexpr bool __has_unique_backend(const __execution_backend __backends) noexcept
3132
{
32-
__invalid_execution_policy = 0,
33-
__sequenced = 1 << 0,
34-
__parallel = 1 << 1,
35-
__unsequenced = 1 << 2,
36-
__parallel_unsequenced = __execution_policy::__parallel | __execution_policy::__unsequenced,
37-
};
38-
39-
[[nodiscard]] _CCCL_API constexpr bool
40-
__satisfies_execution_policy(__execution_policy __lhs, __execution_policy __rhs) noexcept
41-
{
42-
return (static_cast<uint32_t>(__lhs) & static_cast<uint32_t>(__rhs)) != 0;
33+
return ::cuda::std::has_single_bit(static_cast<uint32_t>(__backends));
4334
}
4435

45-
template <__execution_policy _Policy>
46-
struct __policy
36+
//! @brief Base class for our execution policies.
37+
//! It takes an untagged uint32_t because we want to be able to store 3 different enumerations in it.
38+
template <uint32_t _Policy, __execution_backend _Backend>
39+
struct __execution_policy_base
4740
{
48-
template <__execution_policy _OtherPolicy>
49-
[[nodiscard]] _CCCL_API friend constexpr bool operator==(const __policy&, const __policy<_OtherPolicy>&) noexcept
41+
//! @brief Tag that identifies this and all derived classes as a CCCL execution policy
42+
static constexpr uint32_t __cccl_policy_ = _Policy;
43+
44+
template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
45+
[[nodiscard]] _CCCL_API friend constexpr bool
46+
operator==(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy, _OtherBackend>&) noexcept
5047
{
51-
using __underlying_t = underlying_type_t<__execution_policy>;
52-
return (static_cast<__underlying_t>(_Policy) == static_cast<__underlying_t>(_OtherPolicy));
48+
return _Policy == _OtherPolicy;
5349
}
5450

5551
#if _CCCL_STD_VER <= 2017
56-
template <__execution_policy _OtherPolicy>
57-
[[nodiscard]] _CCCL_API friend constexpr bool operator!=(const __policy&, const __policy<_OtherPolicy>&) noexcept
52+
template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
53+
[[nodiscard]] _CCCL_API friend constexpr bool
54+
operator!=(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy, _OtherBackend>&) noexcept
5855
{
59-
using __underlying_t = underlying_type_t<__execution_policy>;
60-
return (static_cast<__underlying_t>(_Policy) != static_cast<__underlying_t>(_OtherPolicy));
56+
return _Policy != _OtherPolicy;
6157
}
6258
#endif // _CCCL_STD_VER <= 2017
6359

64-
static constexpr __execution_policy __policy_ = _Policy;
65-
};
60+
//! @brief Extracts the execution policy from the stored _Policy
61+
[[nodiscard]] _CCCL_API static constexpr __execution_policy __get_policy() noexcept
62+
{
63+
return __policy_to_execution_policy<_Policy>;
64+
}
6665

67-
struct sequenced_policy : public __policy<__execution_policy::__sequenced>
68-
{};
66+
//! @brief Extracts the execution backend from the stored _Policy
67+
[[nodiscard]] _CCCL_API static constexpr __execution_backend __get_backend() noexcept
68+
{
69+
return __policy_to_execution_backend<_Policy>;
70+
}
71+
};
6972

73+
using sequenced_policy = __execution_policy_base<static_cast<uint32_t>(__execution_policy::__sequenced)>;
7074
_CCCL_GLOBAL_CONSTANT sequenced_policy seq{};
7175

72-
struct parallel_policy : public __policy<__execution_policy::__parallel>
73-
{};
76+
using parallel_policy = __execution_policy_base<static_cast<uint32_t>(__execution_policy::__parallel)>;
7477
_CCCL_GLOBAL_CONSTANT parallel_policy par{};
7578

76-
struct parallel_unsequenced_policy : public __policy<__execution_policy::__parallel_unsequenced>
77-
{};
79+
using parallel_unsequenced_policy =
80+
__execution_policy_base<static_cast<uint32_t>(__execution_policy::__parallel_unsequenced)>;
7881
_CCCL_GLOBAL_CONSTANT parallel_unsequenced_policy par_unseq{};
7982

80-
struct unsequenced_policy : public __policy<__execution_policy::__unsequenced>
81-
{};
83+
using unsequenced_policy = __execution_policy_base<static_cast<uint32_t>(__execution_policy::__unsequenced)>;
8284
_CCCL_GLOBAL_CONSTANT unsequenced_policy unseq{};
8385

8486
_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___FWD_EXECUTION_POLICY_H
12+
#define _CUDA_STD___FWD_EXECUTION_POLICY_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#include <cuda/std/cstdint>
25+
26+
#include <cuda/std/__cccl/prologue.h>
27+
28+
_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
29+
30+
//! @brief The standard execution policies, stored as bit flags in a single byte
//! `__parallel_unsequenced` is the union of the `__parallel` and `__unsequenced` bits.
enum class __execution_policy : uint8_t
{
  __invalid_execution_policy = 0x00,
  __sequenced                = 0x01,
  __parallel                 = 0x02,
  __unsequenced              = 0x04,
  __parallel_unsequenced     = __parallel | __unsequenced,
};
39+
40+
//! @brief Extracts the execution policy from the stored _Policy
41+
template <uint32_t _Policy>
42+
inline constexpr __execution_policy __policy_to_execution_policy = __execution_policy{(_Policy & uint32_t{0x000000FF})};
43+
44+
//! @brief Bit flags naming the execution backends we support
//! @note Deliberately not an enum class, because a user might specify multiple backends
enum __execution_backend : uint8_t
{
  // No backend selected
  __none = 0x00,
  // The backends we provide; each enumerator only exists when its backend is enabled
#if _CCCL_HAS_BACKEND_CUDA()
  __cuda = 0x02,
#endif // _CCCL_HAS_BACKEND_CUDA()
#if _CCCL_HAS_BACKEND_OMP()
  __omp = 0x04,
#endif // _CCCL_HAS_BACKEND_OMP()
#if _CCCL_HAS_BACKEND_TBB()
  __tbb = 0x08,
#endif // _CCCL_HAS_BACKEND_TBB()
};
60+
61+
//! @brief Extracts the execution backend from the stored _Policy
62+
template <uint32_t _Policy>
63+
inline constexpr __execution_backend __policy_to_execution_backend =
64+
__execution_backend{(_Policy & uint32_t{0x0000FF00}) >> 8};
65+
66+
//! @brief Forward declaration of the execution policy base class
//! When `_Backend` is not given explicitly it defaults to the backend encoded in `_Policy`.
template <uint32_t _Policy, __execution_backend _Backend = __policy_to_execution_backend<_Policy>>
67+
struct __execution_policy_base;
68+
69+
_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
70+
71+
#include <cuda/std/__cccl/epilogue.h>
72+
73+
#endif // _CUDA_STD___FWD_EXECUTION_POLICY_H
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___INTERNAL_PSTL_CONFIG_H
12+
#define _CUDA_STD___INTERNAL_PSTL_CONFIG_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#include <cuda/std/__cccl/prologue.h>
25+
26+
// Availability switches for the execution backends. Each macro expands to an expression usable in `#if`.
//
// The CUDA condition is wrapped in parentheses so that it stays a single unit when the macro is combined with
// other operators: without them `!_CCCL_HAS_BACKEND_CUDA()` would negate only `_CCCL_CUDA_COMPILATION()`
// instead of the whole condition.
#define _CCCL_HAS_BACKEND_CUDA() (_CCCL_CUDA_COMPILATION() && !_CCCL_COMPILER(NVRTC))
// The OpenMP and TBB switches are hard-wired to 0 here.
#define _CCCL_HAS_BACKEND_OMP() 0
#define _CCCL_HAS_BACKEND_TBB() 0
29+
30+
#include <cuda/std/__cccl/epilogue.h>
31+
32+
#endif // _CUDA_STD___INTERNAL_PSTL_CONFIG_H

0 commit comments

Comments
 (0)