Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 64 additions & 3 deletions cpp/include/rmm/device_scalar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <optional>
#include <type_traits>

namespace RMM_NAMESPACE {
Expand Down Expand Up @@ -52,8 +53,22 @@ class device_scalar {
using const_pointer = typename device_uvector<T>::const_pointer; ///< The type of the iterator
///< returned by data() const

/**
 * @brief A struct to configure memory resources for a `device_scalar`.
 *
 * This struct allows for specifying a device memory resource for the scalar's storage and
 * an optional host memory resource for a bounce buffer to optimize host-device transfers.
 */
struct memory_resource_args {
  device_async_resource_ref device_mr{mr::get_current_device_resource_ref()};  ///< Device memory resource for the scalar's storage; defaults to the current device resource
  std::optional<host_resource_ref> bounce_buffer_host_mr{std::nullopt};  ///< Optional host memory resource; when set, a `sizeof(T)` bounce buffer is allocated from it for host-device copies
};

RMM_EXEC_CHECK_DISABLE
~device_scalar()
{
  // Release the host bounce buffer, if the bounce-buffer constructor allocated one.
  // The buffer is only ever engaged together with `_host_mr`, so dereferencing the
  // optional resource here is safe.
  //
  // NOTE(review): the move constructor/assignment are still `= default`; moving a
  // `std::optional<T*>` leaves the source engaged with the same pointer, so a
  // moved-from device_scalar would deallocate the buffer a second time here. The
  // move members should disengage `_host_bounce_buffer` — verify before merging.
  // NOTE(review): no stream synchronization is performed here; confirm that an
  // in-flight async copy using this buffer cannot outlive the scalar.
  if (_host_bounce_buffer.has_value()) {
    _host_mr->deallocate(_host_bounce_buffer.value(), sizeof(T));
  }
}

RMM_EXEC_CHECK_DISABLE
device_scalar(device_scalar&&) noexcept = default; ///< Default move constructor
Expand Down Expand Up @@ -124,6 +139,31 @@ class device_scalar {
set_value_async(initial_value, stream);
}

/**
 * @brief Construct a new `device_scalar` with an initial value.
 *
 * Does not synchronize the stream.
 *
 * @note This device_scalar is only safe to access in kernels and copies on the specified CUDA
 * stream, or on another stream only if a dependency is enforced (e.g. using
 * `cudaStreamWaitEvent()`).
 *
 * @throws rmm::bad_alloc if allocating the device memory for `initial_value` fails.
 * @throws rmm::cuda_error if copying `initial_value` to device memory fails.
 *
 * @param initial_value The initial value of the object in device memory.
 * @param stream Optional, stream on which to perform allocation and copy.
 * @param mr_args Arguments to configure memory resources for a `device_scalar`.
 */
explicit device_scalar(value_type const& initial_value,
                       cuda_stream_view stream,
                       memory_resource_args const& mr_args)
  : _storage{1, stream, mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr}
{
  // Allocate the bounce buffer only when a host resource was supplied.
  if (_host_mr.has_value()) { _host_bounce_buffer = static_cast<T*>(_host_mr->allocate(sizeof(T))); }
  // If the initial copy throws, the destructor will not run for this partially
  // constructed object, so release the bounce buffer before rethrowing to avoid
  // leaking the host allocation.
  try {
    set_value_async(initial_value, stream);
  } catch (...) {
    if (_host_bounce_buffer.has_value()) {
      _host_mr->deallocate(_host_bounce_buffer.value(), sizeof(T));
      _host_bounce_buffer.reset();
    }
    throw;
  }
}

/**
* @brief Construct a new `device_scalar` by deep copying the contents of
* another `device_scalar`, using the specified stream and memory
Expand Down Expand Up @@ -161,7 +201,16 @@ class device_scalar {
*/
[[nodiscard]] value_type value(cuda_stream_view stream) const
{
  if (_host_bounce_buffer.has_value()) {
    // Pinned-host path: stage the device value in the bounce buffer with an async
    // copy, then wait for the stream so the host read below is safe.
    RMM_CUDA_TRY(cudaMemcpyAsync(
      _host_bounce_buffer.value(), data(), sizeof(T), cudaMemcpyDefault, stream.value()));
    stream.synchronize();
    return *_host_bounce_buffer.value();
  } else {
    // Pageable-host path: may trigger an implicit synchronization.
    return _storage.front_element(stream);
  }
}

/**
Expand Down Expand Up @@ -203,7 +252,15 @@ class device_scalar {
*/
void set_value_async(value_type const& value, cuda_stream_view stream)
{
  if (_host_bounce_buffer.has_value()) {
    // Pinned-host path: stage the new value in the bounce buffer, then issue an
    // async host-to-device copy on the given stream.
    // NOTE(review): back-to-back calls reuse the same bounce buffer; the host
    // write below could race with a still-in-flight copy issued by a previous
    // call — confirm callers serialize, or synchronize before overwriting.
    *_host_bounce_buffer.value() = value;
    RMM_CUDA_TRY(cudaMemcpyAsync(
      data(), _host_bounce_buffer.value(), sizeof(T), cudaMemcpyDefault, stream.value()));
  } else {
    // Pageable-host path: may trigger an implicit synchronization.
    _storage.set_element_async(0, value, stream);
  }
}

// Disallow passing literals to set_value to avoid race conditions where the memory holding the
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe you can support this now as well: copy the literal into the bounce buffer immediately, then perform the device copy asynchronously.

Copy link
Contributor Author

@JigaoLuo JigaoLuo Jul 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @devavret , thanks!
Sorry. I wasn’t sure I understood you correctly.
These 3 lines refer to the else case, where no host-pinned bounce buffer is allocated — that is the only behavior RMM currently supports.
Do you mean we just allocate a buffer for every function call of this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think @devavret is referring to the deleted function below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get it now. Let’s first discuss where the buffer should be placed, as mentioned above. Once that’s settled, I can address this detailed point.

Expand Down Expand Up @@ -275,6 +332,10 @@ class device_scalar {

private:
  rmm::device_uvector<T> _storage;  ///< Device storage holding the single element
  std::optional<host_resource_ref> _host_mr{
    std::nullopt};  ///< Optional host memory resource for bounce buffers
  std::optional<T*> _host_bounce_buffer{
    std::nullopt};  ///< Optional bounce buffer for host-device transfers
};

/** @} */ // end of group
Expand Down
25 changes: 25 additions & 0 deletions cpp/tests/device_scalar_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <rmm/device_scalar.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/host/pinned_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime_api.h>
Expand Down Expand Up @@ -122,6 +123,30 @@ TYPED_TEST(DeviceScalarTest, MoveCtor)
EXPECT_EQ(nullptr, scalar.data());
}

// Verify that a device_scalar constructed with a pinned-host bounce buffer
// round-trips its initial value through value().
TYPED_TEST(DeviceScalarTest, InitialValueWithBounceBuffer)
{
  rmm::mr::pinned_memory_resource host_mr;
  // Pair the fixture's device resource with a pinned host resource so the
  // scalar allocates a bounce buffer.
  typename rmm::device_scalar<TypeParam>::memory_resource_args mr_args{
    this->mr, std::make_optional<rmm::host_resource_ref>(&host_mr)};

  rmm::device_scalar<TypeParam> scalar{this->value, this->stream, mr_args};
  EXPECT_NE(nullptr, scalar.data());
  EXPECT_EQ(this->value, scalar.value(this->stream));
}

// Verify that set_value_async() routed through the pinned-host bounce buffer
// updates the device value observed by value().
TYPED_TEST(DeviceScalarTest, SetValueWithBounceBuffer)
{
  rmm::mr::pinned_memory_resource host_mr;
  typename rmm::device_scalar<TypeParam>::memory_resource_args mr_args{
    this->mr, std::make_optional<rmm::host_resource_ref>(&host_mr)};
  rmm::device_scalar<TypeParam> scalar{this->value, this->stream, mr_args};

  EXPECT_NE(nullptr, scalar.data());
  auto expected = this->random_value();
  scalar.set_value_async(expected, this->stream);
  // value() synchronizes the stream, so the async set is complete when compared.
  EXPECT_EQ(expected, scalar.value(this->stream));
}

TYPED_TEST(DeviceScalarTest, SetValue)
{
rmm::device_scalar<TypeParam> scalar{this->value, this->stream, this->mr};
Expand Down