Skip to content

Commit bba027b

Browse files
committed
Implement parallel cuda::std::for_each
1 parent f54ed97 commit bba027b

File tree

18 files changed

+835
-39
lines changed

18 files changed

+835
-39
lines changed

cub/cub/device/dispatch/dispatch_streaming_reduce.cuh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include <cub/device/dispatch/dispatch_reduce.cuh>
1717
#include <cub/iterator/arg_index_input_iterator.cuh>
1818

19-
#include <thrust/iterator/constant_iterator.h>
2019
#include <thrust/iterator/iterator_adaptor.h>
2120

2221
#include <cuda/__iterator/tabulate_output_iterator.h>

libcudacxx/cmake/LibcudacxxBuildCompilerTargets.cmake

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ function(libcudacxx_build_compiler_targets)
4444
# selected dialect target from cccl:
4545
target_link_libraries(
4646
libcudacxx.compiler_interface
47-
INTERFACE cccl.compiler_interface_cpp${CMAKE_CUDA_STANDARD}
47+
INTERFACE
48+
cccl.compiler_interface_cpp${CMAKE_CUDA_STANDARD}
49+
Thrust::Thrust
50+
CUB::CUB
4851
)
4952
endfunction()
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
7+
//
8+
//===----------------------------------------------------------------------===//
9+
10+
#ifndef _CUDA___EXECUTION_POLICY_H
11+
#define _CUDA___EXECUTION_POLICY_H
12+
13+
#include <cuda/std/detail/__config>
14+
15+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
16+
# pragma GCC system_header
17+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
18+
# pragma clang system_header
19+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
20+
# pragma system_header
21+
#endif // no system header
22+
23+
#if _CCCL_HAS_BACKEND_CUDA()
24+
25+
# include <cuda/__fwd/execution_policy.h>
26+
# include <cuda/std/__execution/policy.h>
27+
# include <cuda/std/__type_traits/is_execution_policy.h>
28+
29+
# include <cuda/std/__cccl/prologue.h>
30+
31+
_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
32+
33+
template <uint32_t _Policy>
34+
struct _CCCL_DECLSPEC_EMPTY_BASES __execution_policy_base<_Policy, __execution_backend::__cuda>
35+
: __execution_policy_base<_Policy, __execution_backend::__none>
36+
{};
37+
38+
_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
39+
40+
_CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION
41+
42+
using __cub_parallel_unsequenced_policy =
43+
::cuda::std::execution::__execution_policy_base<::cuda::std::execution::__with_cuda_backend<static_cast<uint32_t>(
44+
::cuda::std::execution::__execution_policy::__parallel_unsequenced)>()>;
45+
_CCCL_GLOBAL_CONSTANT __cub_parallel_unsequenced_policy __cub_par_unseq{};
46+
47+
_CCCL_END_NAMESPACE_CUDA_EXECUTION
48+
49+
# include <cuda/std/__cccl/epilogue.h>
50+
51+
#endif // _CCCL_HAS_BACKEND_CUDA()
52+
53+
#endif // _CUDA___EXECUTION_POLICY_H
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
7+
//
8+
//===----------------------------------------------------------------------===//
9+
10+
#ifndef _CUDA___FWD_EXECUTION_POLICY_H
11+
#define _CUDA___FWD_EXECUTION_POLICY_H
12+
13+
#include <cuda/std/detail/__config>
14+
15+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
16+
# pragma GCC system_header
17+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
18+
# pragma clang system_header
19+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
20+
# pragma system_header
21+
#endif // no system header
22+
23+
#if _CCCL_HAS_BACKEND_CUDA()
24+
25+
# include <cuda/std/__fwd/execution_policy.h>
26+
27+
# include <cuda/std/__cccl/prologue.h>
28+
29+
_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
30+
31+
//! @brief Sets the execution backend to cuda
32+
template <uint32_t _Policy>
33+
[[nodiscard]] _CCCL_API constexpr uint32_t __with_cuda_backend() noexcept
34+
{
35+
constexpr uint32_t __backend_mask{0xFFFF00FF};
36+
constexpr uint32_t __new_policy =
37+
(_Policy & __backend_mask) | (static_cast<uint32_t>(__execution_backend::__cuda) << 8);
38+
return __new_policy;
39+
}
40+
41+
_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
42+
43+
# include <cuda/std/__cccl/epilogue.h>
44+
45+
#endif // _CCCL_HAS_BACKEND_CUDA()
46+
47+
#endif // _CUDA___FWD_EXECUTION_POLICY_H

libcudacxx/include/cuda/std/__execution/policy.h

Lines changed: 15 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -20,63 +20,43 @@
2020
# pragma system_header
2121
#endif // no system header
2222

23+
#include <cuda/std/__bit/has_single_bit.h>
24+
#include <cuda/std/__fwd/execution_policy.h>
2325
#include <cuda/std/cstdint>
2426

2527
#include <cuda/std/__cccl/prologue.h>
2628

2729
_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
2830

29-
//! @brief Enumerates the standard execution policies
30-
enum class __execution_policy : uint8_t
31+
[[nodiscard]] _CCCL_API constexpr bool __has_unique_backend(const __execution_backend __backends) noexcept
3132
{
32-
__invalid_execution_policy = 0,
33-
__sequenced = 1 << 0,
34-
__parallel = 1 << 1,
35-
__unsequenced = 1 << 2,
36-
__parallel_unsequenced = __execution_policy::__parallel | __execution_policy::__unsequenced,
37-
};
38-
39-
//! @brief Enumerates the different backends we support
40-
//! @note Not an enum class because a user might specify multiple backends
41-
enum __execution_backend : uint8_t
42-
{
43-
// The backends we provide
44-
__none = 0,
45-
#if _CCCL_HAS_BACKEND_CUDA()
46-
__cuda = 1 << 1,
47-
#endif // _CCCL_HAS_BACKEND_CUDA()
48-
#if _CCCL_HAS_BACKEND_OMP()
49-
__omp = 1 << 2,
50-
#endif // _CCCL_HAS_BACKEND_OMP()
51-
#if _CCCL_HAS_BACKEND_TBB()
52-
__tbb = 1 << 3,
53-
#endif // _CCCL_HAS_BACKEND_TBB()
54-
};
33+
return ::cuda::std::has_single_bit(static_cast<uint32_t>(__backends));
34+
}
5535

5636
//! @brief Base class for our execution policies.
5737
//! It takes an untagged uint32_t because we want to be able to store 3 different enumerations in it.
58-
template <uint32_t _Policy>
38+
template <uint32_t _Policy, __execution_backend _Backend>
5939
struct __execution_policy_base
6040
{
61-
template <uint32_t _OtherPolicy>
41+
//! @brief Tag that identifies this and all derived classes as a CCCL execution policy
42+
static constexpr uint32_t __cccl_policy_ = _Policy;
43+
44+
template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
6245
[[nodiscard]] _CCCL_API friend constexpr bool
63-
operator==(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy>&) noexcept
46+
operator==(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy, _OtherBackend>&) noexcept
6447
{
65-
return _Policy == _OtherPolicy;
48+
return _Policy == _OtherPolicy && _Backend == _OtherBackend;
6649
}
6750

6851
#if _CCCL_STD_VER <= 2017
69-
template <uint32_t _OtherPolicy>
52+
template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
7053
[[nodiscard]] _CCCL_API friend constexpr bool
71-
operator!=(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy>&) noexcept
54+
operator!=(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy, _OtherBackend>&) noexcept
7255
{
73-
return _Policy != _OtherPolicy;
56+
return _Policy != _OtherPolicy || _Backend != _OtherBackend;
7457
}
7558
#endif // _CCCL_STD_VER <= 2017
7659

77-
//! @brief Tag that identifies this and all derived classes as a CCCL execution policy
78-
static constexpr uint32_t __cccl_policy_ = _Policy;
79-
8060
//! @brief Extracts the execution policy from the stored _Policy
8161
[[nodiscard]] _CCCL_API static constexpr __execution_policy __get_policy() noexcept
8262
{
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___FWD_EXECUTION_POLICY_H
12+
#define _CUDA_STD___FWD_EXECUTION_POLICY_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#include <cuda/std/cstdint>
25+
26+
#include <cuda/std/__cccl/prologue.h>
27+
28+
_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
29+
30+
//! @brief Enumerates the standard execution policies
31+
enum class __execution_policy : uint8_t
32+
{
33+
__invalid_execution_policy = 0,
34+
__sequenced = 1 << 0,
35+
__parallel = 1 << 1,
36+
__unsequenced = 1 << 2,
37+
__parallel_unsequenced = __execution_policy::__parallel | __execution_policy::__unsequenced,
38+
};
39+
40+
//! @brief Enumerates the different backends we support
41+
//! @note Not an enum class because a user might specify multiple backends
42+
enum __execution_backend : uint8_t
43+
{
44+
// The backends we provide
45+
__none = 0,
46+
#if _CCCL_HAS_BACKEND_CUDA()
47+
__cuda = 1 << 1,
48+
#endif // _CCCL_HAS_BACKEND_CUDA()
49+
#if _CCCL_HAS_BACKEND_OMP()
50+
__omp = 1 << 2,
51+
#endif // _CCCL_HAS_BACKEND_OMP()
52+
#if _CCCL_HAS_BACKEND_TBB()
53+
__tbb = 1 << 3,
54+
#endif // _CCCL_HAS_BACKEND_TBB()
55+
};
56+
57+
//! @brief Extracts the execution backend from the stored _Policy
58+
template <uint32_t _Policy>
59+
inline constexpr __execution_backend __to_backend = __execution_backend{(_Policy & uint32_t{0x0000FF00}) >> 8};
60+
61+
template <uint32_t _Policy, __execution_backend _Backend = __to_backend<_Policy>>
62+
struct __execution_policy_base;
63+
64+
_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
65+
66+
#include <cuda/std/__cccl/epilogue.h>
67+
68+
#endif // _CUDA_STD___FWD_EXECUTION_POLICY_H

libcudacxx/include/cuda/std/__internal/pstl_config.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323

2424
#include <cuda/std/__cccl/prologue.h>
2525

26-
#define _CCCL_HAS_BACKEND_CUDA() 0
26+
#define _CCCL_HAS_BACKEND_CUDA() _CCCL_CUDA_COMPILATION() && !_CCCL_COMPILER(NVRTC)
2727
#define _CCCL_HAS_BACKEND_OMP() 0
2828
#define _CCCL_HAS_BACKEND_TBB() 0
2929

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_CUDA_FOR_EACH_N_H
12+
#define _CUDA_STD___PSTL_CUDA_FOR_EACH_N_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if _CCCL_HAS_BACKEND_CUDA()
25+
26+
# include <cub/device/device_for.cuh>
27+
28+
# include <cuda/__execution/policy.h>
29+
# include <cuda/__runtime/api_wrapper.h>
30+
# include <cuda/__stream/stream_ref.h>
31+
# include <cuda/std/__algorithm/for_each_n.h>
32+
# include <cuda/std/__exception/cuda_error.h>
33+
# include <cuda/std/__exception/terminate.h>
34+
# include <cuda/std/__execution/policy.h>
35+
# include <cuda/std/__iterator/iterator_traits.h>
36+
# include <cuda/std/__pstl/dispatch.h>
37+
# include <cuda/std/__type_traits/always_false.h>
38+
# include <cuda/std/__utility/convert_to_integral.h>
39+
# include <cuda/std/__utility/move.h>
40+
41+
# include <nv/target>
42+
43+
# include <cuda/std/__cccl/prologue.h>
44+
45+
_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
46+
47+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
48+
49+
template <>
50+
struct __pstl_dispatch<__pstl_algorithm::__for_each_n, __execution_backend::__cuda>
51+
{
52+
template <class _Policy, class _Iter, class _Size, class _Fn>
53+
[[nodiscard]] _CCCL_HOST_API static _Iter
54+
__par_impl([[maybe_unused]] _Policy __policy, _Iter __first, _Size __orig_n, _Fn __func) noexcept
55+
{
56+
const auto __count = ::cuda::std::__convert_to_integral(__orig_n);
57+
::cuda::stream_ref __stream{cudaStreamPerThread};
58+
59+
_CCCL_TRY_CUDA_API(
60+
::cub::DeviceFor::ForEachN,
61+
"__pstl_dispatch: kernel launch failed",
62+
__first,
63+
__count,
64+
::cuda::std::move(__func),
65+
__stream.get());
66+
67+
__stream.sync();
68+
69+
return __first + __count;
70+
}
71+
72+
template <class _Policy, class _Iter, class _Size, class _Fn>
73+
[[nodiscard]] _CCCL_HOST_API _CCCL_FORCEINLINE _Iter
74+
operator()(_Policy __policy, _Iter __first, _Size __orig_n, _Fn __func) const noexcept
75+
{
76+
if constexpr (::cuda::std::__has_random_access_traversal<_Iter>)
77+
{
78+
return __par_impl(::cuda::std::move(__policy), ::cuda::std::move(__first), __orig_n, ::cuda::std::move(__func));
79+
}
80+
else
81+
{
82+
static_assert(__always_false_v<_Policy>,
83+
"__pstl_dispatch: CUDA backend of cuda::std::for_each_n requires at least random access iterators");
84+
return ::cuda::std::for_each_n(::cuda::std::move(__first), __orig_n, ::cuda::std::move(__func));
85+
}
86+
}
87+
};
88+
89+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
90+
91+
_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
92+
93+
# include <cuda/std/__cccl/epilogue.h>
94+
95+
#endif /// _CCCL_HAS_BACKEND_CUDA()
96+
97+
#endif // _CUDA_STD___PSTL_CUDA_FOR_EACH_N_H

0 commit comments

Comments
 (0)