-
Notifications
You must be signed in to change notification settings - Fork 233
[ 🚧 Draft] : Adding host-mr for pinned bounce buffer to rmm::device_scalar
#1985
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ | |
| #include <rmm/mr/device/per_device_resource.hpp> | ||
| #include <rmm/resource_ref.hpp> | ||
|
|
||
| #include <optional> | ||
| #include <type_traits> | ||
|
|
||
| namespace RMM_NAMESPACE { | ||
|
|
@@ -52,8 +53,22 @@ class device_scalar { | |
| using const_pointer = typename device_uvector<T>::const_pointer; ///< The type of the iterator | ||
| ///< returned by data() const | ||
|
|
||
| /** | ||
| * @brief A struct to configure memory resources for a `device_scalar`. | ||
| * | ||
| * This struct allows for specifying a device memory resource for the scalar's storage and | ||
| * an optional host memory resource for a bounce buffer to optimize host-device transfers. | ||
| */ | ||
| struct memory_resource_args { | ||
| device_async_resource_ref device_mr{mr::get_current_device_resource_ref()}; | ||
| std::optional<host_resource_ref> bounce_buffer_host_mr{std::nullopt}; | ||
| }; | ||
|
|
||
| RMM_EXEC_CHECK_DISABLE | ||
| ~device_scalar() = default; | ||
| ~device_scalar() | ||
| { | ||
| if (_host_bounce_buffer.has_value()) { _host_mr->deallocate(_host_bounce_buffer.value(), sizeof(T)); } | ||
| } | ||
|
|
||
| RMM_EXEC_CHECK_DISABLE | ||
| device_scalar(device_scalar&&) noexcept = default; ///< Default move constructor | ||
|
|
@@ -124,6 +139,31 @@ class device_scalar { | |
| set_value_async(initial_value, stream); | ||
| } | ||
|
|
||
| /** | ||
| * @brief Construct a new `device_scalar` with an initial value. | ||
| * | ||
| * Does not synchronize the stream. | ||
| * | ||
| * @note This device_scalar is only safe to access in kernels and copies on the specified CUDA | ||
| * stream, or on another stream only if a dependency is enforced (e.g. using | ||
| * `cudaStreamWaitEvent()`). | ||
| * | ||
| * @throws rmm::bad_alloc if allocating the device memory for `initial_value` fails. | ||
| * @throws rmm::cuda_error if copying `initial_value` to device memory fails. | ||
| * | ||
| * @param initial_value The initial value of the object in device memory. | ||
| * @param stream Optional, stream on which to perform allocation and copy. | ||
| * @param mr_args Arguments to configure memory resources for a `device_scalar`. | ||
| */ | ||
| explicit device_scalar(value_type const& initial_value, | ||
| cuda_stream_view stream, | ||
| memory_resource_args const& mr_args) | ||
| : _storage{1, stream, mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr} | ||
| { | ||
| if (_host_mr.has_value()) { _host_bounce_buffer = static_cast<T*>(_host_mr->allocate(sizeof(T))); } | ||
| set_value_async(initial_value, stream); | ||
| } | ||
|
|
||
| /** | ||
| * @brief Construct a new `device_scalar` by deep copying the contents of | ||
| * another `device_scalar`, using the specified stream and memory | ||
|
|
@@ -161,7 +201,16 @@ class device_scalar { | |
| */ | ||
| [[nodiscard]] value_type value(cuda_stream_view stream) const | ||
| { | ||
| return _storage.front_element(stream); | ||
| if (_host_bounce_buffer.has_value()) { | ||
| // Case: Copying with pinned host memory | ||
| RMM_CUDA_TRY(cudaMemcpyAsync( | ||
| _host_bounce_buffer.value(), data(), sizeof(T), cudaMemcpyDefault, stream.value())); | ||
| stream.synchronize(); | ||
| return *_host_bounce_buffer.value(); | ||
| } else { | ||
| // Case: Copying with pageable host memory — may trigger an implicit synchronization. | ||
| return _storage.front_element(stream); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -203,7 +252,15 @@ class device_scalar { | |
| */ | ||
| void set_value_async(value_type const& value, cuda_stream_view stream) | ||
| { | ||
| _storage.set_element_async(0, value, stream); | ||
| if (_host_bounce_buffer.has_value()) { | ||
| // Case: Copying with pinned host memory | ||
| *_host_bounce_buffer.value() = value; | ||
| RMM_CUDA_TRY(cudaMemcpyAsync( | ||
| data(), _host_bounce_buffer.value(), sizeof(T), cudaMemcpyDefault, stream.value())); | ||
| } else { | ||
| // Case: Copying with pageable host memory — may trigger an implicit synchronization. | ||
| _storage.set_element_async(0, value, stream); | ||
| } | ||
| } | ||
|
|
||
| // Disallow passing literals to set_value to avoid race conditions where the memory holding the | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe you can do this too now? copy to bounce buffer immediately and then do the copy async
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @devavret , thanks!
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think @devavret is referring to the deleted function below.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get it now. Let’s first discuss where the buffer should be placed, as mentioned above. Once that’s settled, I can address this detailed point. |
||
|
|
@@ -275,6 +332,10 @@ class device_scalar { | |
|
|
||
| private: | ||
| rmm::device_uvector<T> _storage; | ||
| std::optional<host_resource_ref> _host_mr{ | ||
| std::nullopt}; /// Optional host memory resource for bounce buffers | ||
| std::optional<T*> _host_bounce_buffer{ | ||
| std::nullopt}; /// Optional bounce buffer for host-device transfers | ||
| }; | ||
|
|
||
| /** @} */ // end of group | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This makes me wonder if the bounce buffer copying support should be in device_buffer instead
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @harrism, thanks! Yes, that’s also my main concern. It’s why I didn’t prepare a well-considered pull request but only a draft here, as there’s a chance it’ll need to be rewritten again, just like I already did in cuDF.
We can discuss where the buffer should live. cuDF places the buffer in its scalar header, and that’s the behavior I’d like to mimic.
Alternatively, storing it in the RMM devicevector header is also possible, since the element call results in a copy as well. The twin issue is here: #1955