Provide a way to specify a stream for an execution policy via `policy.on(stream)`

miscco · miscco · commit 2b9f630b5369 · 2025-10-27T15:09:52.000+01:00
diff --git a/libcudacxx/include/cuda/std/__execution/policy.h b/libcudacxx/include/cuda/std/__execution/policy.h
@@ -22,6 +22,8 @@
 
 #include <cuda/__stream/stream_ref.h>
 #include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/__execution/stream_policy.h>
+#include <cuda/std/__fwd/policy.h>
 #include <cuda/std/cstdint>
 
 #include <cuda/std/__cccl/prologue.h>
@@ -98,13 +100,6 @@ struct __execution_policy_base
   }
 #endif // _CCCL_STD_VER <= 2017
 
-#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
-  [[nodiscard]] _CCCL_HOST_API static ::cuda::stream_ref get_stream() noexcept
-  {
-    return ::cuda::stream_ref{cudaStreamPerThread};
-  }
-#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
-
   //! @brief Tag that identifies this and all derived classes as a CCCL execution policy
   static constexpr uint32_t __cccl_policy_ = _Policy;
 
@@ -152,6 +147,19 @@ struct __execution_policy_base
     constexpr uint32_t __direction_mask{0xFF00FFFF};
     return (_Policy & __direction_mask) & (static_cast<uint32_t>(__pol) << 16);
   }
+
+#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+  [[nodiscard]] _CCCL_HOST_API static ::cuda::stream_ref get_stream() noexcept
+  { // A policy without an attached stream always reports cudaStreamPerThread
+    return ::cuda::stream_ref{cudaStreamPerThread};
+  }
+
+  [[nodiscard]] _CCCL_HOST_API static __execution_policy_stream<__execution_policy_base>
+  on(::cuda::stream_ref __stream) noexcept
+  { // Wraps this policy type so that get_stream() on the result yields __stream
+    return __execution_policy_stream<__execution_policy_base>{__stream};
+  }
+#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 };
 
 using sequenced_policy = __execution_policy_base<static_cast<uint32_t>(__execution_policy::__sequenced)>;
diff --git a/libcudacxx/include/cuda/std/__execution/stream_policy.h b/libcudacxx/include/cuda/std/__execution/stream_policy.h
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___EXECUTION_STREAM_POLICY_H
+#define _CUDA_STD___EXECUTION_STREAM_POLICY_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
+#  include <cuda/__stream/stream_ref.h>
+#  include <cuda/std/__bit/has_single_bit.h>
+#  include <cuda/std/__fwd/policy.h>
+#  include <cuda/std/cstdint>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_EXECUTION
+
+//! @brief Wrapper around an execution policy that additionally stores the stream reported by get_stream()
+template <class _Policy>
+class __execution_policy_stream : public _Policy
+{
+private:
+  ::cuda::stream_ref __stream_; //!< Stream handed out by get_stream()
+
+public:
+  _CCCL_HOST_API explicit __execution_policy_stream(::cuda::stream_ref __stream) noexcept
+      : __stream_(__stream)
+  {} // explicit: a stream_ref must never silently convert into an execution policy
+
+  [[nodiscard]] _CCCL_HOST_API ::cuda::stream_ref get_stream() const noexcept
+  { // Hides a get_stream() inherited from _Policy, if any, and returns the stored stream
+    return __stream_;
+  }
+};
+
+_CCCL_END_NAMESPACE_EXECUTION
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
+#endif // _CUDA_STD___EXECUTION_STREAM_POLICY_H
diff --git a/libcudacxx/include/cuda/std/__fwd/policy.h b/libcudacxx/include/cuda/std/__fwd/policy.h
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___FWD_POLICY_H
+#define _CUDA_STD___FWD_POLICY_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/cstdint>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_EXECUTION
+
+template <uint32_t _Policy>
+struct __execution_policy_base; // forward declaration only; defined in <cuda/std/__execution/policy.h>
+
+_CCCL_END_NAMESPACE_EXECUTION
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___FWD_POLICY_H
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include <cuda/std/execution>
+#include <cuda/std/type_traits>
+#include <cuda/stream>
+
+template <class Policy>
+void test(Policy pol)
+{
+  { // A policy without an attached stream must report cudaStreamPerThread
+    cuda::stream_ref expected_stream{cudaStreamPerThread};
+    assert(cuda::get_stream(pol) == expected_stream);
+  }
+
+  { // A stream attached via on() must be the one reported by cuda::get_stream
+    cuda::stream stream{cuda::device_ref{0}};
+    auto pol_with_stream = pol.on(stream);
+    assert(cuda::get_stream(pol_with_stream) == stream);
+
+    static_assert(noexcept(pol.on(stream))); // attaching a stream must not throw
+    static_assert(cuda::std::is_base_of_v<Policy, decltype(pol_with_stream)>); // result derives from Policy
+    static_assert(cuda::std::is_execution_policy_v<decltype(pol_with_stream)>); // and still counts as a policy
+  }
+}
+
+void test()
+{ // Run the stream checks once for each of the four policy objects below
+  test(cuda::std::execution::seq);
+  test(cuda::std::execution::par);
+  test(cuda::std::execution::unseq);
+  test(cuda::std::execution::par_unseq);
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();)) // host only; the policy's get_stream()/on() are declared _CCCL_HOST_API
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each.cu
@@ -22,6 +22,7 @@
 #include <cuda/std/__pstl/for_each.h>
 #include <cuda/std/execution>
 #include <cuda/std/functional>
+#include <cuda/stream>
 
 #include <testing.cuh>
 #include <utility.cuh>
@@ -41,9 +42,23 @@ struct mark_present_for_each
 
 C2H_TEST("cuda::std::for_each", "[parallel algorithm]")
 {
-  thrust::device_vector<bool> res(size, false);
-  mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+  SECTION("with default stream")
+  { // Plain par_unseq: no stream is attached to the policy
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    cuda::std::for_each(cuda::std::execution::par_unseq, cuda::counting_iterator{0}, cuda::counting_iterator{size}, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 
-  cuda::std::for_each(cuda::std::execution::par_unseq, cuda::counting_iterator{0}, cuda::counting_iterator{size}, fn);
-  CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  SECTION("with unique stream")
+  { // NOTE(review): all_of below is not given `stream`; presumably for_each only returns once its work is done - confirm
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    cuda::std::for_each(
+      cuda::std::execution::par_unseq.on(stream), cuda::counting_iterator{0}, cuda::counting_iterator{size}, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 }
diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.nonmodifying/alg.for_each/pstl_for_each_n.cu
@@ -24,6 +24,7 @@
 #include <cuda/std/__pstl/for_each_n.h>
 #include <cuda/std/execution>
 #include <cuda/std/functional>
+#include <cuda/stream>
 
 #include <testing.cuh>
 #include <utility.cuh>
@@ -43,9 +44,22 @@ struct mark_present_for_each
 
 C2H_TEST("cuda::std::for_each_n", "[parallel algorithm]")
 {
-  thrust::device_vector<bool> res(size, false);
-  mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+  SECTION("with default stream")
+  { // Plain par_unseq: no stream is attached to the policy
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    cuda::std::for_each_n(cuda::std::execution::par_unseq, cuda::counting_iterator{0}, size, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 
-  cuda::std::for_each_n(cuda::std::execution::par_unseq, cuda::counting_iterator{0}, size, fn);
-  CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  SECTION("with unique stream")
+  { // NOTE(review): all_of below is not given `stream`; presumably for_each_n only returns once its work is done - confirm
+    ::cuda::stream stream{::cuda::device_ref{0}};
+    thrust::device_vector<bool> res(size, false);
+    mark_present_for_each fn{thrust::raw_pointer_cast(res.data())};
+
+    cuda::std::for_each_n(cuda::std::execution::par_unseq.on(stream), cuda::counting_iterator{0}, size, fn);
+    CHECK(thrust::all_of(res.begin(), res.end(), cuda::std::identity{}));
+  }
 }