diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index 533bb5cedaf..933f489be6f 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/cudax/include/cuda/experimental/__execution/policy.cuh b/cudax/include/cuda/experimental/__execution/policy.cuh index e5f9248aae9..d004dfa06a5 100644 --- a/cudax/include/cuda/experimental/__execution/policy.cuh +++ b/cudax/include/cuda/experimental/__execution/policy.cuh @@ -44,9 +44,9 @@ struct any_execution_policy _CCCL_HIDE_FROM_ABI any_execution_policy() = default; - template <__execution_policy _Policy> - _CCCL_HOST_API constexpr any_execution_policy(::cuda::std::execution::__policy<_Policy>) noexcept - : value(_Policy) + template + _CCCL_HOST_API constexpr any_execution_policy(::cuda::std::execution::__execution_policy_base<_Policy>) noexcept + : value(value_type{_Policy}) {} _CCCL_HOST_API constexpr operator __execution_policy() const noexcept @@ -54,38 +54,38 @@ struct any_execution_policy return value; } - _CCCL_HOST_API constexpr auto operator()() const noexcept -> __execution_policy + _CCCL_HOST_API constexpr auto operator()() const noexcept -> value_type { return value; } - template <__execution_policy _Policy> + template [[nodiscard]] _CCCL_HOST_API friend constexpr bool - operator==(const any_execution_policy& pol, const ::cuda::std::execution::__policy<_Policy>&) noexcept + operator==(const any_execution_policy& pol, const ::cuda::std::execution::__execution_policy_base<_Policy>&) noexcept { - return pol.value == _Policy; + return pol.value == value_type{_Policy}; } #if _CCCL_STD_VER <= 2017 - template <__execution_policy _Policy> + template [[nodiscard]] _CCCL_HOST_API friend constexpr bool - operator==(const ::cuda::std::execution::__policy<_Policy>&, const any_execution_policy& pol) noexcept + operator==(const ::cuda::std::execution::__execution_policy_base<_Policy>&, const any_execution_policy& pol) noexcept { - return pol.value == _Policy; + return pol.value == value_type{_Policy}; } - template <__execution_policy _Policy> + template [[nodiscard]] _CCCL_HOST_API friend constexpr bool - operator!=(const any_execution_policy& pol, const ::cuda::std::execution::__policy<_Policy>&) noexcept + operator!=(const any_execution_policy& pol, const ::cuda::std::execution::__execution_policy_base<_Policy>&) noexcept { - return pol.value != _Policy; + return pol.value != value_type{_Policy}; } - template <__execution_policy _Policy> + template [[nodiscard]] _CCCL_HOST_API friend constexpr bool - operator!=(const ::cuda::std::execution::__policy<_Policy>&, const any_execution_policy& pol) + operator!=(const ::cuda::std::execution::__execution_policy_base<_Policy>&, const any_execution_policy& pol) { - return pol.value != _Policy; + return pol.value != value_type{_Policy}; } #endif // _CCCL_STD_VER <= 2017 diff --git a/libcudacxx/cmake/LibcudacxxBuildCompilerTargets.cmake b/libcudacxx/cmake/LibcudacxxBuildCompilerTargets.cmake index 961721d6d5c..ac97d395c28 100644 --- a/libcudacxx/cmake/LibcudacxxBuildCompilerTargets.cmake +++ b/libcudacxx/cmake/LibcudacxxBuildCompilerTargets.cmake @@ -53,5 +53,7 @@ function(libcudacxx_build_compiler_targets) # order matters here, we need the libcudacxx options to override the cccl options. cccl.compiler_interface libcudacxx.compiler_flags + Thrust::Thrust + CUB::CUB ) endfunction() diff --git a/libcudacxx/include/cuda/__execution/policy.h b/libcudacxx/include/cuda/__execution/policy.h new file mode 100644 index 00000000000..3613a4099e9 --- /dev/null +++ b/libcudacxx/include/cuda/__execution/policy.h @@ -0,0 +1,53 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___EXECUTION_POLICY_H +#define _CUDA___EXECUTION_POLICY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_BACKEND_CUDA() + +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +template +struct _CCCL_DECLSPEC_EMPTY_BASES __execution_policy_base<_Policy, __execution_backend::__cuda> + : __execution_policy_base<_Policy, __execution_backend::__none> +{}; + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +_CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION + +using __cub_parallel_unsequenced_policy = + ::cuda::std::execution::__execution_policy_base<::cuda::std::execution::__with_cuda_backend( + ::cuda::std::execution::__execution_policy::__parallel_unsequenced)>()>; +_CCCL_GLOBAL_CONSTANT __cub_parallel_unsequenced_policy __cub_par_unseq{}; + +_CCCL_END_NAMESPACE_CUDA_EXECUTION + +# include + +#endif // _CCCL_HAS_BACKEND_CUDA() + +#endif // _CUDA___EXECUTION_POLICY_H diff --git a/libcudacxx/include/cuda/__fwd/execution_policy.h b/libcudacxx/include/cuda/__fwd/execution_policy.h new file mode 100644 index 00000000000..d0928b10291 --- /dev/null +++ b/libcudacxx/include/cuda/__fwd/execution_policy.h @@ -0,0 +1,47 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FWD_EXECUTION_POLICY_H +#define _CUDA___FWD_EXECUTION_POLICY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_BACKEND_CUDA() + +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +//! @brief Sets the execution backend to cuda +template +[[nodiscard]] _CCCL_API constexpr uint32_t __with_cuda_backend() noexcept +{ + constexpr uint32_t __backend_mask{0xFFFF00FF}; + constexpr uint32_t __new_policy = + (_Policy & __backend_mask) | (static_cast(__execution_backend::__cuda) << 8); + return __new_policy; +} + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +# include + +#endif // _CCCL_HAS_BACKEND_CUDA() + +#endif // _CUDA___FWD_EXECUTION_POLICY_H diff --git a/libcudacxx/include/cuda/std/__execution/policy.h b/libcudacxx/include/cuda/std/__execution/policy.h index 4ee74d48fc0..d4bf5e05c4a 100644 --- a/libcudacxx/include/cuda/std/__execution/policy.h +++ b/libcudacxx/include/cuda/std/__execution/policy.h @@ -20,65 +20,67 @@ # pragma system_header #endif // no system header -#include +#include +#include #include #include _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION -enum class __execution_policy : uint32_t +[[nodiscard]] _CCCL_API constexpr bool __has_unique_backend(const __execution_backend __backends) noexcept { - __invalid_execution_policy = 0, - __sequenced = 1 << 0, - __parallel = 1 << 1, - __unsequenced = 1 << 2, - __parallel_unsequenced = __execution_policy::__parallel | __execution_policy::__unsequenced, -}; - -[[nodiscard]] _CCCL_API constexpr bool -__satisfies_execution_policy(__execution_policy __lhs, __execution_policy __rhs) noexcept -{ - return (static_cast(__lhs) & static_cast(__rhs)) != 0; + return ::cuda::std::has_single_bit(static_cast(__backends)); } -template <__execution_policy _Policy> -struct __policy +//! @brief Base class for our execution policies. +//! It takes an untagged uint32_t because we want to be able to store 3 different enumerations in it. +template +struct __execution_policy_base { - template <__execution_policy _OtherPolicy> - [[nodiscard]] _CCCL_API friend constexpr bool operator==(const __policy&, const __policy<_OtherPolicy>&) noexcept + //! @brief Tag that identifies this and all derived classes as a CCCL execution policy + static constexpr uint32_t __cccl_policy_ = _Policy; + + template + [[nodiscard]] _CCCL_API friend constexpr bool + operator==(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy, _OtherBackend>&) noexcept { - using __underlying_t = underlying_type_t<__execution_policy>; - return (static_cast<__underlying_t>(_Policy) == static_cast<__underlying_t>(_OtherPolicy)); + return _Policy == _OtherPolicy; } #if _CCCL_STD_VER <= 2017 - template <__execution_policy _OtherPolicy> - [[nodiscard]] _CCCL_API friend constexpr bool operator!=(const __policy&, const __policy<_OtherPolicy>&) noexcept + template + [[nodiscard]] _CCCL_API friend constexpr bool + operator!=(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy, _OtherBackend>&) noexcept { - using __underlying_t = underlying_type_t<__execution_policy>; - return (static_cast<__underlying_t>(_Policy) != static_cast<__underlying_t>(_OtherPolicy)); + return _Policy != _OtherPolicy; } #endif // _CCCL_STD_VER <= 2017 - static constexpr __execution_policy __policy_ = _Policy; -}; + //! @brief Extracts the execution policy from the stored _Policy + [[nodiscard]] _CCCL_API static constexpr __execution_policy __get_policy() noexcept + { + return __policy_to_execution_policy<_Policy>; + } -struct sequenced_policy : public __policy<__execution_policy::__sequenced> -{}; + //! @brief Extracts the execution backend from the stored _Policy + [[nodiscard]] _CCCL_API static constexpr __execution_backend __get_backend() noexcept + { + return __policy_to_execution_backend<_Policy>; + } +}; +using sequenced_policy = __execution_policy_base(__execution_policy::__sequenced)>; _CCCL_GLOBAL_CONSTANT sequenced_policy seq{}; -struct parallel_policy : public __policy<__execution_policy::__parallel> -{}; +using parallel_policy = __execution_policy_base(__execution_policy::__parallel)>; _CCCL_GLOBAL_CONSTANT parallel_policy par{}; -struct parallel_unsequenced_policy : public __policy<__execution_policy::__parallel_unsequenced> -{}; +using parallel_unsequenced_policy = + __execution_policy_base(__execution_policy::__parallel_unsequenced)>; _CCCL_GLOBAL_CONSTANT parallel_unsequenced_policy par_unseq{}; -struct unsequenced_policy : public __policy<__execution_policy::__unsequenced> -{}; +using unsequenced_policy = __execution_policy_base(__execution_policy::__unsequenced)>; _CCCL_GLOBAL_CONSTANT unsequenced_policy unseq{}; _CCCL_END_NAMESPACE_CUDA_STD_EXECUTION diff --git a/libcudacxx/include/cuda/std/__fwd/execution_policy.h b/libcudacxx/include/cuda/std/__fwd/execution_policy.h new file mode 100644 index 00000000000..61f183e13bf --- /dev/null +++ b/libcudacxx/include/cuda/std/__fwd/execution_policy.h @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___FWD_EXECUTION_POLICY_H +#define _CUDA_STD___FWD_EXECUTION_POLICY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +//! @brief Enumerates the standard execution policies +enum class __execution_policy : uint8_t +{ + __invalid_execution_policy = 0, + __sequenced = 1 << 0, + __parallel = 1 << 1, + __unsequenced = 1 << 2, + __parallel_unsequenced = __execution_policy::__parallel | __execution_policy::__unsequenced, +}; + +//! @brief Extracts the execution policy from the stored _Policy +template +inline constexpr __execution_policy __policy_to_execution_policy = __execution_policy{(_Policy & uint32_t{0x000000FF})}; + +//! @brief Enumerates the different backends we support +//! @note Not an enum class because a user might specify multiple backends +enum __execution_backend : uint8_t +{ + // The backends we provide + __none = 0, +#if _CCCL_HAS_BACKEND_CUDA() + __cuda = 1 << 1, +#endif // _CCCL_HAS_BACKEND_CUDA() +#if _CCCL_HAS_BACKEND_OMP() + __omp = 1 << 2, +#endif // _CCCL_HAS_BACKEND_OMP() +#if _CCCL_HAS_BACKEND_TBB() + __tbb = 1 << 3, +#endif // _CCCL_HAS_BACKEND_TBB() +}; + +//! @brief Extracts the execution backend from the stored _Policy +template +inline constexpr __execution_backend __policy_to_execution_backend = + __execution_backend{(_Policy & uint32_t{0x0000FF00}) >> 8}; + +template > +struct __execution_policy_base; + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +#include + +#endif // _CUDA_STD___FWD_EXECUTION_POLICY_H diff --git a/libcudacxx/include/cuda/std/__internal/pstl_config.h b/libcudacxx/include/cuda/std/__internal/pstl_config.h new file mode 100644 index 00000000000..9c08c05294d --- /dev/null +++ b/libcudacxx/include/cuda/std/__internal/pstl_config.h @@ -0,0 +1,32 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___INTERNAL_PSTL_CONFIG_H +#define _CUDA_STD___INTERNAL_PSTL_CONFIG_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#define _CCCL_HAS_BACKEND_CUDA() _CCCL_CUDA_COMPILATION() && !_CCCL_COMPILER(NVRTC) +#define _CCCL_HAS_BACKEND_OMP() 0 +#define _CCCL_HAS_BACKEND_TBB() 0 + +#include + +#endif // _CUDA_STD___INTERNAL_PSTL_CONFIG_H diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h b/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h new file mode 100644 index 00000000000..78a95b5e221 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_CUDA_FOR_EACH_N_H +#define _CUDA_STD___PSTL_CUDA_FOR_EACH_N_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_BACKEND_CUDA() + +# include + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +template <> +struct __pstl_dispatch<__pstl_algorithm::__for_each_n, __execution_backend::__cuda> +{ + template + [[nodiscard]] _CCCL_HOST_API static _Iter + __par_impl([[maybe_unused]] _Policy __policy, _Iter __first, _Size __orig_n, _Fn __func) noexcept + { + const auto __count = ::cuda::std::__convert_to_integral(__orig_n); + ::cuda::stream_ref __stream{cudaStreamPerThread}; + + _CCCL_TRY_CUDA_API( + ::cub::DeviceFor::ForEachN, + "__pstl_dispatch: kernel launch failed", + __first, + __count, + ::cuda::std::move(__func), + __stream.get()); + + __stream.sync(); + + return __first + __count; + } + + template + [[nodiscard]] _CCCL_HOST_API _CCCL_FORCEINLINE _Iter + operator()(_Policy __policy, _Iter __first, _Size __orig_n, _Fn __func) const noexcept + { + if constexpr (::cuda::std::__has_random_access_traversal<_Iter>) + { + return __par_impl(::cuda::std::move(__policy), ::cuda::std::move(__first), __orig_n, ::cuda::std::move(__func)); + } + else + { + static_assert(__always_false_v<_Policy>, + "__pstl_dispatch: CUDA backend of cuda::std::for_each_n requires at least random access iterators"); + return ::cuda::std::for_each_n(::cuda::std::move(__first), __orig_n, ::cuda::std::move(__func)); + } + } +}; + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +# include + +#endif /// _CCCL_HAS_BACKEND_CUDA() + +#endif // _CUDA_STD___PSTL_CUDA_FOR_EACH_N_H diff --git a/libcudacxx/include/cuda/std/__pstl/dispatch.h b/libcudacxx/include/cuda/std/__pstl/dispatch.h new file mode 100644 index 00000000000..22b49051f90 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/dispatch.h @@ -0,0 +1,123 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_DISPATCH_H +#define _CUDA_STD___PSTL_DISPATCH_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +enum class __pstl_algorithm +{ + // The find_if family + __find, + __find_if, + __any_of, + __all_of, + __none_of, + __is_partitioned, + + // merge family + // non implemented + + // sort family + __sort, + + // for_each family + __for_each_n, + __fill, + __fill_n, + __replace, + __replace_if, + __generate, + __generate_n, + + // transform_reduce and transform_reduce_binary family + __count_if, + __count, + __equal, + __reduce, + + // transform and transform_binary family + __replace_copy_if, + __replace_copy, + __move, + __copy, + __copy_n, + __rotate_copy, +}; + +//! @brief tag type to indicate that we cannot dispatch to a parallel algorithm and should run the algorithm serially +struct __pstl_no_dispatch +{}; + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +//! @brief Dispatcher for a given @tparam _Algorith and @tparam _Policy +//! If @class __pstl_dispatch is not specialized by the chosen backend we will fall back to serial execution +template <__pstl_algorithm _Algorithm, __execution_backend _Backend> +struct __pstl_dispatch : public __pstl_no_dispatch +{}; + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +//! @brief Helper variable that detects whether @class __pstl_dispatch has been specialized so that we can +//! dispatch +template +inline constexpr bool __pstl_can_dispatch = false; + +template <__pstl_algorithm _Algorithm, __execution_backend _Backend> +inline constexpr bool __pstl_can_dispatch<__pstl_dispatch<_Algorithm, _Backend>> = + !::cuda::std::is_base_of_v<__pstl_no_dispatch, __pstl_dispatch<_Algorithm, _Backend>>; + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +//! @brief Top layer dispatcher that returns a concrete dispatch if possible +template <__pstl_algorithm _Algorithm, class _Policy> +[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __pstl_select_dispatch() noexcept +{ + // First extract the desired backend from the policy + constexpr __execution_backend __backend = _Policy::__get_backend(); + + // If the user requests a unique backends, we must take that + if constexpr (::cuda::std::execution::__has_unique_backend(__backend)) + { + return __pstl_dispatch<_Algorithm, __backend>{}; + } + else + { + // No dispatch found, return invalid to signal serial execution + return __pstl_dispatch<_Algorithm, __execution_backend::__none>{}; + } +} + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +#include + +#endif // _CUDA_STD___PSTL_DISPATCH_H diff --git a/libcudacxx/include/cuda/std/__pstl/for_each.h b/libcudacxx/include/cuda/std/__pstl/for_each.h new file mode 100644 index 00000000000..6bdf0bf9b28 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/for_each.h @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_FOR_EACH_H +#define _CUDA_STD___PSTL_FOR_EACH_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include + +# if _CCCL_HAS_BACKEND_CUDA() +# include +# endif // _CCCL_HAS_BACKEND_CUDA() + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +_CCCL_TEMPLATE(class _Policy, class _Iter, class _Fn) +_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>) +_CCCL_HOST_API void for_each([[maybe_unused]] _Policy __policy, _Iter __first, _Iter __last, _Fn __func) +{ + [[maybe_unused]] auto __dispatch = + ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__for_each_n, _Policy>(); + if constexpr (::cuda::std::execution::__pstl_can_dispatch) + { + (void) __dispatch(::cuda::std::move(__policy), + ::cuda::std::move(__first), + ::cuda::std::distance(__first, __last), + ::cuda::std::move(__func)); + } + else + { + static_assert(__always_false_v<_Policy>, "Parallel cuda::std::for_each requires at least one selected backend"); + ::cuda::std::for_each(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__func)); + } +} + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // !_CCCL_COMPILER(NVRTC) + +#endif // _CUDA_STD___PSTL_FOR_EACH_H diff --git a/libcudacxx/include/cuda/std/__pstl/for_each_n.h b/libcudacxx/include/cuda/std/__pstl/for_each_n.h new file mode 100644 index 00000000000..082e1a0d385 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/for_each_n.h @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_FOR_EACH_N_H +#define _CUDA_STD___PSTL_FOR_EACH_N_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include + +# if _CCCL_HAS_BACKEND_CUDA() +# include +# endif // _CCCL_HAS_BACKEND_CUDA() + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +_CCCL_TEMPLATE(class _Policy, class _Iter, class _Size, class _Fn) +_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>) +_CCCL_HOST_API _Iter for_each_n([[maybe_unused]] _Policy __policy, _Iter __first, _Size __orig_n, _Fn __func) +{ + [[maybe_unused]] auto __dispatch = + ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__for_each_n, _Policy>(); + if constexpr (::cuda::std::execution::__pstl_can_dispatch) + { + return __dispatch(::cuda::std::move(__policy), ::cuda::std::move(__first), __orig_n, ::cuda::std::move(__func)); + } + else + { + static_assert(__always_false_v<_Policy>, "Parallel cuda::std::for_each_n requires at least one selected backend"); + return ::cuda::std::for_each_n(::cuda::std::move(__first), __orig_n, ::cuda::std::move(__func)); + } +} + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // !_CCCL_COMPILER(NVRTC) + +#endif // _CUDA_STD___PSTL_FOR_EACH_N_H diff --git a/libcudacxx/include/cuda/std/__type_traits/is_execution_policy.h b/libcudacxx/include/cuda/std/__type_traits/is_execution_policy.h index 7114201e056..338b5aa76fb 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_execution_policy.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_execution_policy.h @@ -20,7 +20,7 @@ # pragma system_header #endif // no system header -#include +#include #include #include @@ -28,51 +28,33 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD -template +template inline constexpr bool is_execution_policy_v = false; -// Ensure we ignore cv qualifiers -template -inline constexpr bool is_execution_policy_v = is_execution_policy_v<_Tp>; - -template -inline constexpr bool is_execution_policy_v = is_execution_policy_v<_Tp>; - -template -inline constexpr bool is_execution_policy_v = is_execution_policy_v<_Tp>; - -// Explicitly mark our execution policies as such -template <> -inline constexpr bool is_execution_policy_v<::cuda::std::execution::sequenced_policy> = true; - -template <> -inline constexpr bool is_execution_policy_v<::cuda::std::execution::parallel_policy> = true; - -template <> -inline constexpr bool is_execution_policy_v<::cuda::std::execution::parallel_unsequenced_policy> = true; - -template <> -inline constexpr bool is_execution_policy_v<::cuda::std::execution::unsequenced_policy> = true; +template +inline constexpr bool is_execution_policy_v<_Policy, void_t> = true; template struct _CCCL_NO_SPECIALIZATIONS is_execution_policy : bool_constant> {}; // Detect parallel policies -template +template > inline constexpr bool __is_parallel_execution_policy_v = false; template -inline constexpr bool __is_parallel_execution_policy_v<_Policy, void_t> = - __satisfies_execution_policy(_Policy::__policy_, ::cuda::std::execution::__execution_policy::__parallel); +inline constexpr bool __is_parallel_execution_policy_v<_Policy, true> = + _Policy::__get_policy() == ::cuda::std::execution::__execution_policy::__parallel + || _Policy::__get_policy() == ::cuda::std::execution::__execution_policy::__parallel_unsequenced; // Detect unsequenced policies -template +template > inline constexpr bool __is_unsequenced_execution_policy_v = false; template -inline constexpr bool __is_unsequenced_execution_policy_v<_Policy, void_t> = - __satisfies_execution_policy(_Policy::__policy_, ::cuda::std::execution::__execution_policy::__unsequenced); +inline constexpr bool __is_unsequenced_execution_policy_v<_Policy, true> = + _Policy::__get_policy() == ::cuda::std::execution::__execution_policy::__unsequenced + || _Policy::__get_policy() == ::cuda::std::execution::__execution_policy::__parallel_unsequenced; _CCCL_END_NAMESPACE_CUDA_STD diff --git a/libcudacxx/include/cuda/std/detail/__config b/libcudacxx/include/cuda/std/detail/__config index 5cf896c89c2..a3ebabc5fe3 100644 --- a/libcudacxx/include/cuda/std/detail/__config +++ b/libcudacxx/include/cuda/std/detail/__config @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl.for_each.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl.for_each.pass.cpp new file mode 100644 index 00000000000..6eace2d56b7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl.for_each.pass.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc +// XFAIL: true + +// template +// void for_each(ExecutionPolicy&& exec, +// ForwardIterator first, ForwardIterator last, +// Function f); + +#include +#include +#include + +#include "test_execution_policies.h" +#include "test_iterators.h" +#include "test_macros.h" + +EXECUTION_POLICY_SFINAE_TEST(for_each); + +static_assert(!sfinae_test_for_each); +static_assert(sfinae_test_for_each); + +int data[100]; +bool called[100]; + +template +struct Test +{ + template + void operator()(Policy&& policy) + { + int sizes[] = {0, 1, 2, 100}; + for (auto size : sizes) + { + cuda::std::fill(called, called + size, false); + cuda::std::for_each(policy, Iter(data), Iter(data + size), [&](int& v) { + assert(!called[&v - data]); + called[&v - data] = true; + }); + assert(cuda::std::all_of(called, called + size, [](bool b) { + return b; + })); + } + } +}; + +__host__ void test() +{ + types::for_each(types::forward_iterator_list{}, TestIteratorWithPolicies{}); +} + +int main(int, char**) +{ + NV_IF_TARGET(NV_IS_HOST, test();) + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl.for_each_n.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl.for_each_n.pass.cpp new file mode 100644 index 00000000000..6a05cabc69f --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl.for_each_n.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc +// XFAIL: true + +// template +// ForwardIterator for_each_n(ExecutionPolicy&& exec, ForwardIterator first, Size n, +// Function f); + +#include +#include +#include + +#include "test_execution_policies.h" +#include "test_iterators.h" +#include "test_macros.h" + +EXECUTION_POLICY_SFINAE_TEST(for_each_n); + +static_assert(!sfinae_test_for_each_n); +static_assert(sfinae_test_for_each_n); + +int data[100]; +bool called[100]; + +template +struct Test +{ + template + void operator()(Policy&& policy) + { + int sizes[] = {0, 1, 2, 100}; + for (auto size : sizes) + { + cuda::std::fill(called, called + size, false); + cuda::std::for_each_n(policy, Iter(data), size, [&](int& v) { + assert(!called[&v - data]); + called[&v - data] = true; + }); + assert(cuda::std::all_of(called, called + size, [](bool b) { + return b; + })); + } + } +}; + +__host__ void test() +{ + types::for_each(types::forward_iterator_list{}, TestIteratorWithPolicies{}); +} + +int main(int, char**) +{ + NV_IF_TARGET(NV_IS_HOST, test();) + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu new file mode 100644 index 00000000000..286324f2ec7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// template +// void for_each(ExecutionPolicy&& exec, +// ForwardIterator first, ForwardIterator last, +// Function f); + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +inline constexpr int size = 1000; + +struct mark_present_for_each +{ + bool* ptr_; + + template + __host__ __device__ void operator()(T val) const noexcept + { + ptr_[val] = true; + } +}; + +C2H_TEST("cuda::std::for_each", "[parallel algorithm]") +{ + thrust::device_vector res(size, false); + mark_present_for_each fn{thrust::raw_pointer_cast(res.data())}; + + const auto policy = cuda::execution::__cub_par_unseq; + cuda::std::for_each(policy, cuda::counting_iterator{0}, cuda::counting_iterator{size}, fn); + CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{})); +} diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu new file mode 100644 index 00000000000..60f17d59eaf --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// template +// void for_each(ExecutionPolicy&& exec, +// ForwardIterator first, ForwardIterator last, +// Function f); + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +inline constexpr int size = 1000; + +struct mark_present_for_each +{ + bool* ptr_; + + template + __host__ __device__ void operator()(T val) const noexcept + { + ptr_[val] = true; + } +}; + +C2H_TEST("cuda::std::for_each_n", "[parallel algorithm]") +{ + thrust::device_vector res(size, false); + mark_present_for_each fn{thrust::raw_pointer_cast(res.data())}; + + const auto policy = cuda::execution::__cub_par_unseq; + cuda::std::for_each_n(policy, cuda::counting_iterator{0}, size, fn); + CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{})); +} diff --git a/libcudacxx/test/support/test_execution_policies.h b/libcudacxx/test/support/test_execution_policies.h new file mode 100644 index 00000000000..d6adaf64a26 --- /dev/null +++ b/libcudacxx/test/support/test_execution_policies.h @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_SUPPORT_TEST_EXECUTION_POLICIES +#define TEST_SUPPORT_TEST_EXECUTION_POLICIES + +#include +#include +#include +#include + +#include "test_macros.h" + +#define EXECUTION_POLICY_SFINAE_TEST(FUNCTION) \ + template \ + struct sfinae_test_##FUNCTION##_impl : cuda::std::false_type \ + {}; \ + \ + template \ + struct sfinae_test_##FUNCTION##_impl()...))>, \ + Args...> : cuda::std::true_type \ + {}; \ + \ + template \ + inline constexpr bool sfinae_test_##FUNCTION = sfinae_test_##FUNCTION##_impl::value; + +_CCCL_EXEC_CHECK_DISABLE +template +__host__ __device__ bool test_execution_policies(Functor func) +{ + func(cuda::std::execution::seq); + func(cuda::std::execution::unseq); + func(cuda::std::execution::par); + func(cuda::std::execution::par_unseq); + + return true; +} + +template