NVIDIA
diff --git a/‎cub/cub/device/dispatch/dispatch_streaming_reduce.cuh‎
Lines changed: 0 additions & 1 deletion b/‎cub/cub/device/dispatch/dispatch_streaming_reduce.cuh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎libcudacxx/cmake/LibcudacxxBuildCompilerTargets.cmake‎
Lines changed: 4 additions & 1 deletion b/‎libcudacxx/cmake/LibcudacxxBuildCompilerTargets.cmake‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎libcudacxx/include/cuda/__execution/policy.h‎
Lines changed: 53 additions & 0 deletions b/‎libcudacxx/include/cuda/__execution/policy.h‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎libcudacxx/include/cuda/__fwd/execution_policy.h‎
Lines changed: 47 additions & 0 deletions b/‎libcudacxx/include/cuda/__fwd/execution_policy.h‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎libcudacxx/include/cuda/std/__execution/policy.h‎
Lines changed: 15 additions & 35 deletions b/‎libcudacxx/include/cuda/std/__execution/policy.h‎
Lines changed: 15 additions & 35 deletions
diff --git a/‎libcudacxx/include/cuda/std/__fwd/execution_policy.h‎
Lines changed: 68 additions & 0 deletions b/‎libcudacxx/include/cuda/std/__fwd/execution_policy.h‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎libcudacxx/include/cuda/std/__internal/pstl_config.h‎
Lines changed: 1 addition & 1 deletion b/‎libcudacxx/include/cuda/std/__internal/pstl_config.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h‎
Lines changed: 97 additions & 0 deletions b/‎libcudacxx/include/cuda/std/__pstl/cuda/for_each_n.h‎
Lines changed: 97 additions & 0 deletions
@@ -16,7 +16,6 @@
 #include <cub/device/dispatch/dispatch_reduce.cuh>
 #include <cub/iterator/arg_index_input_iterator.cuh>
 
-#include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/iterator_adaptor.h>
 
 #include <cuda/__iterator/tabulate_output_iterator.h>
 
@@ -44,6 +44,9 @@ function(libcudacxx_build_compiler_targets)
   # selected dialect target from cccl:
   target_link_libraries(
     libcudacxx.compiler_interface
-    INTERFACE cccl.compiler_interface_cpp${CMAKE_CUDA_STANDARD}
+    INTERFACE
+      cccl.compiler_interface_cpp${CMAKE_CUDA_STANDARD}
+      Thrust::Thrust
+      CUB::CUB
   )
 endfunction()
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___EXECUTION_POLICY_H
+#define _CUDA___EXECUTION_POLICY_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_BACKEND_CUDA()
+
+#  include <cuda/__fwd/execution_policy.h>
+#  include <cuda/std/__execution/policy.h>
+#  include <cuda/std/__type_traits/is_execution_policy.h>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
+
+//! @brief Specialization of @c __execution_policy_base for policies whose backend is @c __execution_backend::__cuda.
+//! It adds no members of its own and derives from the @c __execution_backend::__none specialization of the same policy.
+template <uint32_t _Policy>
+struct _CCCL_DECLSPEC_EMPTY_BASES __execution_policy_base<_Policy, __execution_backend::__cuda>
+    : __execution_policy_base<_Policy, __execution_backend::__none>
+{};
+
+_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
+
+_CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION
+
+//! @brief Policy type for the parallel unsequenced execution policy with the CUDA backend written into its backend
+//! bits via @c __with_cuda_backend
+using __cub_parallel_unsequenced_policy =
+  ::cuda::std::execution::__execution_policy_base<::cuda::std::execution::__with_cuda_backend<static_cast<uint32_t>(
+    ::cuda::std::execution::__execution_policy::__parallel_unsequenced)>()>;
+//! @brief Global instance of @c __cub_parallel_unsequenced_policy
+_CCCL_GLOBAL_CONSTANT __cub_parallel_unsequenced_policy __cub_par_unseq{};
+
+_CCCL_END_NAMESPACE_CUDA_EXECUTION
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_BACKEND_CUDA()
+
+#endif // _CUDA___EXECUTION_POLICY_H
@@ -0,0 +1,47 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___FWD_EXECUTION_POLICY_H
+#define _CUDA___FWD_EXECUTION_POLICY_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_BACKEND_CUDA()
+
+#  include <cuda/std/__fwd/execution_policy.h>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
+
+//! @brief Returns @c _Policy with its backend byte (bits 8-15) replaced by the CUDA backend tag
+template <uint32_t _Policy>
+[[nodiscard]] _CCCL_API constexpr uint32_t __with_cuda_backend() noexcept
+{
+  // Clear whatever backend is currently stored in bits 8-15 and keep every other bit of the policy
+  constexpr uint32_t __without_backend = _Policy & ~uint32_t{0x0000FF00};
+  // The CUDA backend tag, moved into the position of the backend byte
+  constexpr uint32_t __cuda_bits = static_cast<uint32_t>(__execution_backend::__cuda) << 8;
+  return __without_backend | __cuda_bits;
+}
+
+_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_BACKEND_CUDA()
+
+#endif // _CUDA___FWD_EXECUTION_POLICY_H
@@ -20,63 +20,43 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/__fwd/execution_policy.h>
 #include <cuda/std/cstdint>
 
 #include <cuda/std/__cccl/prologue.h>
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
 
-//! @brief Enumerates the standard execution policies
-enum class __execution_policy : uint8_t
+[[nodiscard]] _CCCL_API constexpr bool __has_unique_backend(const __execution_backend __backends) noexcept
 {
-  __invalid_execution_policy = 0,
-  __sequenced                = 1 << 0,
-  __parallel                 = 1 << 1,
-  __unsequenced              = 1 << 2,
-  __parallel_unsequenced     = __execution_policy::__parallel | __execution_policy::__unsequenced,
-};
-
-//! @brief Enumerates the different backends we support
-//! @note Not an enum class because a user might specify multiple backends
-enum __execution_backend : uint8_t
-{
-  // The backends we provide
-  __none = 0,
-#if _CCCL_HAS_BACKEND_CUDA()
-  __cuda = 1 << 1,
-#endif // _CCCL_HAS_BACKEND_CUDA()
-#if _CCCL_HAS_BACKEND_OMP()
-  __omp = 1 << 2,
-#endif // _CCCL_HAS_BACKEND_OMP()
-#if _CCCL_HAS_BACKEND_TBB()
-  __tbb = 1 << 3,
-#endif // _CCCL_HAS_BACKEND_TBB()
-};
+  return ::cuda::std::has_single_bit(static_cast<uint32_t>(__backends));
+}
 
 //! @brief Base class for our execution policies.
 //! It takes an untagged uint32_t because we want to be able to store 3 different enumerations in it.
-template <uint32_t _Policy>
+template <uint32_t _Policy, __execution_backend _Backend>
 struct __execution_policy_base
 {
-  template <uint32_t _OtherPolicy>
+  //! @brief Tag that identifies this and all derived classes as a CCCL execution policy
+  static constexpr uint32_t __cccl_policy_ = _Policy;
+
+  template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
   [[nodiscard]] _CCCL_API friend constexpr bool
-  operator==(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy>&) noexcept
+  operator==(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy, _OtherBackend>&) noexcept
   {
-    return _Policy == _OtherPolicy;
+    return _Policy == _OtherPolicy && _Backend == _OtherBackend;
   }
 
 #if _CCCL_STD_VER <= 2017
-  template <uint32_t _OtherPolicy>
+  template <uint32_t _OtherPolicy, __execution_backend _OtherBackend>
   [[nodiscard]] _CCCL_API friend constexpr bool
-  operator!=(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy>&) noexcept
+  operator!=(const __execution_policy_base&, const __execution_policy_base<_OtherPolicy, _OtherBackend>&) noexcept
   {
-    return _Policy != _OtherPolicy;
+    return _Policy != _OtherPolicy || _Backend != _OtherBackend;
   }
 #endif // _CCCL_STD_VER <= 2017
 
-  //! @brief Tag that identifies this and all derived classes as a CCCL execution policy
-  static constexpr uint32_t __cccl_policy_ = _Policy;
-
   //! @brief Extracts the execution policy from the stored _Policy
   [[nodiscard]] _CCCL_API static constexpr __execution_policy __get_policy() noexcept
   {
 
@@ -0,0 +1,68 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___FWD_EXECUTION_POLICY_H
+#define _CUDA_STD___FWD_EXECUTION_POLICY_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/cstdint>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
+
+//! @brief Enumerates the standard execution policies; each basic policy occupies its own bit of a uint8_t
+enum class __execution_policy : uint8_t
+{
+  __invalid_execution_policy = 0,
+  __sequenced                = 1 << 0,
+  __parallel                 = 1 << 1,
+  __unsequenced              = 1 << 2,
+  // Both the __parallel and the __unsequenced bit are set
+  __parallel_unsequenced     = __execution_policy::__parallel | __execution_policy::__unsequenced,
+};
+
+//! @brief Enumerates the different backends we support; every backend other than __none occupies its own bit
+//! @note Not an enum class because a user might specify multiple backends
+enum __execution_backend : uint8_t
+{
+  // The backends we provide; an enumerator only exists when its _CCCL_HAS_BACKEND_*() macro evaluates to non-zero
+  __none = 0,
+#if _CCCL_HAS_BACKEND_CUDA()
+  __cuda = 1 << 1,
+#endif // _CCCL_HAS_BACKEND_CUDA()
+#if _CCCL_HAS_BACKEND_OMP()
+  __omp = 1 << 2,
+#endif // _CCCL_HAS_BACKEND_OMP()
+#if _CCCL_HAS_BACKEND_TBB()
+  __tbb = 1 << 3,
+#endif // _CCCL_HAS_BACKEND_TBB()
+};
+
+//! @brief Reads the execution backend that is stored in bits 8-15 of @c _Policy
+template <uint32_t _Policy>
+inline constexpr __execution_backend __to_backend = static_cast<__execution_backend>((_Policy >> 8) & uint32_t{0xFF});
+
+//! @brief Forward declaration of the execution policy base class.
+//! Unless it is passed explicitly, @c _Backend is extracted from @c _Policy through @c __to_backend.
+template <uint32_t _Policy, __execution_backend _Backend = __to_backend<_Policy>>
+struct __execution_policy_base;
+
+_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___FWD_EXECUTION_POLICY_H
@@ -23,7 +23,7 @@
 
 #include <cuda/std/__cccl/prologue.h>
 
-#define _CCCL_HAS_BACKEND_CUDA() 0
+// The CUDA backend is available when compiling CUDA code with anything but NVRTC. The expansion is parenthesized
+// so that `!_CCCL_HAS_BACKEND_CUDA()` and `_CCCL_HAS_BACKEND_CUDA() || ...` apply to the whole condition.
+#define _CCCL_HAS_BACKEND_CUDA() (_CCCL_CUDA_COMPILATION() && !_CCCL_COMPILER(NVRTC))
 #define _CCCL_HAS_BACKEND_OMP()  0
 #define _CCCL_HAS_BACKEND_TBB()  0
 
 
@@ -0,0 +1,97 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___PSTL_CUDA_FOR_EACH_N_H
+#define _CUDA_STD___PSTL_CUDA_FOR_EACH_N_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_BACKEND_CUDA()
+
+#  include <cub/device/device_for.cuh>
+
+#  include <cuda/__execution/policy.h>
+#  include <cuda/__runtime/api_wrapper.h>
+#  include <cuda/__stream/stream_ref.h>
+#  include <cuda/std/__algorithm/for_each_n.h>
+#  include <cuda/std/__exception/cuda_error.h>
+#  include <cuda/std/__exception/terminate.h>
+#  include <cuda/std/__execution/policy.h>
+#  include <cuda/std/__iterator/iterator_traits.h>
+#  include <cuda/std/__pstl/dispatch.h>
+#  include <cuda/std/__type_traits/always_false.h>
+#  include <cuda/std/__utility/convert_to_integral.h>
+#  include <cuda/std/__utility/move.h>
+
+#  include <nv/target>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
+
+_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
+
+//! @brief Dispatch of the parallel @c for_each_n algorithm to the CUDA backend, implemented with
+//! @c cub::DeviceFor::ForEachN
+template <>
+struct __pstl_dispatch<__pstl_algorithm::__for_each_n, __execution_backend::__cuda>
+{
+  //! @brief Applies @c __func to the @c __orig_n elements starting at @c __first through @c cub::DeviceFor::ForEachN
+  //! @param __policy The execution policy; not used by this implementation
+  //! @param __first Iterator to the first element
+  //! @param __orig_n Number of elements, converted with @c __convert_to_integral
+  //! @param __func The callable handed to CUB
+  //! @return @c __first advanced by the converted element count
+  // NOTE(review): this function is noexcept; if _CCCL_TRY_CUDA_API or stream_ref::sync report failures by throwing,
+  // such a failure would terminate the program -- confirm that this is intended.
+  template <class _Policy, class _Iter, class _Size, class _Fn>
+  [[nodiscard]] _CCCL_HOST_API static _Iter
+  __par_impl([[maybe_unused]] _Policy __policy, _Iter __first, _Size __orig_n, _Fn __func) noexcept
+  {
+    // All work is submitted to the per-thread default stream
+    ::cuda::stream_ref __per_thread_stream{cudaStreamPerThread};
+    const auto __num_items = ::cuda::std::__convert_to_integral(__orig_n);
+
+    _CCCL_TRY_CUDA_API(
+      ::cub::DeviceFor::ForEachN,
+      "__pstl_dispatch: kernel launch failed",
+      __first,
+      __num_items,
+      ::cuda::std::move(__func),
+      __per_thread_stream.get());
+
+    // Synchronize the stream before handing the end iterator back to the caller
+    __per_thread_stream.sync();
+
+    return __first + __num_items;
+  }
+
+  //! @brief Entry point of the dispatch. Forwards to @c __par_impl for random access iterators and rejects every
+  //! other iterator category at compile time.
+  template <class _Policy, class _Iter, class _Size, class _Fn>
+  [[nodiscard]] _CCCL_HOST_API _CCCL_FORCEINLINE _Iter
+  operator()(_Policy __policy, _Iter __first, _Size __orig_n, _Fn __func) const noexcept
+  {
+    if constexpr (!::cuda::std::__has_random_access_traversal<_Iter>)
+    {
+      // Instantiating this branch is always a hard error; the return statement only keeps the function well formed
+      static_assert(__always_false_v<_Policy>,
+                    "__pstl_dispatch: CUDA backend of cuda::std::for_each_n requires at least random access iterators");
+      return ::cuda::std::for_each_n(::cuda::std::move(__first), __orig_n, ::cuda::std::move(__func));
+    }
+    else
+    {
+      return __par_impl(::cuda::std::move(__policy), ::cuda::std::move(__first), __orig_n, ::cuda::std::move(__func));
+    }
+  }
+};
+
+_CCCL_END_NAMESPACE_ARCH_DEPENDENT
+
+_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_BACKEND_CUDA()
+
+#endif // _CUDA_STD___PSTL_CUDA_FOR_EACH_N_H