diff --git a/cpp/include/rmm/device_buffer.hpp b/cpp/include/rmm/device_buffer.hpp
index 3e3f92564..aadc2ea7e 100644
--- a/cpp/include/rmm/device_buffer.hpp
+++ b/cpp/include/rmm/device_buffer.hpp
@@ -26,6 +26,7 @@
 #include <stdexcept>
 #include <utility>
+#include <optional>
 
 namespace RMM_NAMESPACE {
 
 /**
@@ -80,6 +81,17 @@ namespace RMM_NAMESPACE {
  */
 class device_buffer {
  public:
+  /**
+   * @brief A struct to configure memory resources for a `device_buffer`.
+   *
+   * This struct allows for specifying a device memory resource for the buffer's storage and
+   * an optional host memory resource for a bounce buffer to optimize host-device transfers.
+   */
+  struct memory_resource_args {
+    device_async_resource_ref device_mr{mr::get_current_device_resource_ref()};
+    std::optional<host_resource_ref> bounce_buffer_host_mr{std::nullopt};
+  };
+
   // The copy constructor and copy assignment operator without a stream are deleted because they
   // provide no way to specify an explicit stream
   device_buffer(device_buffer const& other) = delete;
@@ -107,6 +119,21 @@
                          cuda_stream_view stream,
                          device_async_resource_ref mr = mr::get_current_device_resource_ref());
 
+  /**
+   * @brief Constructs a new device buffer of `size` uninitialized bytes
+   * with optional host bounce buffer
+   *
+   * @throws rmm::bad_alloc If allocation fails.
+   *
+   * @param size Size in bytes to allocate in device memory.
+   * @param stream CUDA stream on which memory may be allocated if the memory
+   * resource supports streams.
+   * @param mr_args Arguments to configure memory resources for a `device_buffer`.
+   */
+  explicit device_buffer(std::size_t size,
+                         cuda_stream_view stream,
+                         memory_resource_args const& mr_args);
+
   /**
    * @brief Construct a new device buffer by copying from a raw pointer to an existing host or
    * device memory allocation.
@@ -131,6 +158,30 @@ class device_buffer {
                 cuda_stream_view stream,
                 device_async_resource_ref mr = mr::get_current_device_resource_ref());
 
+  /**
+   * @brief Construct a new device buffer by copying from a raw pointer to an existing host or
+   * device memory allocation with optional host bounce buffer.
+   *
+   * @note This function does not synchronize `stream`. `source_data` is copied on `stream`, so the
+   * caller is responsible for correct synchronization to ensure that `source_data` is valid when
+   * the copy occurs. This includes destroying `source_data` in stream order after this function is
+   * called, or synchronizing or waiting on `stream` after this function returns as necessary.
+   *
+   * @throws rmm::bad_alloc If creating the new allocation fails.
+   * @throws rmm::logic_error If `source_data` is null, and `size != 0`.
+   * @throws rmm::cuda_error if copying from the device memory fails.
+   *
+   * @param source_data Pointer to the host or device memory to copy from.
+   * @param size Size in bytes to copy.
+   * @param stream CUDA stream on which memory may be allocated if the memory
+   * resource supports streams.
+   * @param mr_args Arguments to configure memory resources for a `device_buffer`.
+   */
+  device_buffer(void const* source_data,
+                std::size_t size,
+                cuda_stream_view stream,
+                memory_resource_args const& mr_args);
+
   /**
    * @brief Construct a new `device_buffer` by deep copying the contents of
    * another `device_buffer`, optionally using the specified stream and memory
@@ -156,6 +207,30 @@
                 cuda_stream_view stream,
                 device_async_resource_ref mr = mr::get_current_device_resource_ref());
 
+  /**
+   * @brief Construct a new `device_buffer` by deep copying the contents of
+   * another `device_buffer` with optional host bounce buffer.
+   *
+   * @note Only copies `other.size()` bytes from `other`, i.e., if
+   *`other.size() != other.capacity()`, then the size and capacity of the newly
+   * constructed `device_buffer` will be equal to `other.size()`.
+   *
+   * @note This function does not synchronize `stream`. `other` is copied on `stream`, so the
+   * caller is responsible for correct synchronization to ensure that `other` is valid when
+   * the copy occurs. This includes destroying `other` in stream order after this function is
+   * called, or synchronizing or waiting on `stream` after this function returns as necessary.
+   *
+   * @throws rmm::bad_alloc If creating the new allocation fails.
+   * @throws rmm::cuda_error if copying from `other` fails.
+   *
+   * @param other The `device_buffer` whose contents will be copied
+   * @param stream The stream to use for the allocation and copy
+   * @param mr_args Arguments to configure memory resources for a `device_buffer`.
+   */
+  device_buffer(device_buffer const& other,
+                cuda_stream_view stream,
+                memory_resource_args const& mr_args);
+
   /**
    * @brief Constructs a new `device_buffer` by moving the contents of another
    * `device_buffer` into the newly constructed one.
@@ -332,6 +407,11 @@ class device_buffer {
                                                 ///< allocate/deallocate device memory
   cuda_device_id _device{get_current_cuda_device()};
 
+  std::optional<host_resource_ref> _host_mr{
+    std::nullopt};  ///< Optional host memory resource for bounce buffers
+  std::optional<void*> _host_bounce_buffer{
+    std::nullopt};  ///< Optional bounce buffer for host-device transfers
+
   /**
    * @brief Allocates the specified amount of memory and updates the size/capacity accordingly.
    *
diff --git a/cpp/src/device_buffer.cpp b/cpp/src/device_buffer.cpp
index c2857b756..62b3ad1dd 100644
--- a/cpp/src/device_buffer.cpp
+++ b/cpp/src/device_buffer.cpp
@@ -19,6 +19,8 @@
 
 #include <rmm/device_buffer.hpp>
 
+#include <cuda_runtime_api.h>
+
 namespace rmm {
 
 device_buffer::device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {}
@@ -32,6 +34,15 @@ device_buffer::device_buffer(std::size_t size,
   allocate_async(size);
 }
 
+device_buffer::device_buffer(std::size_t size,
+                             cuda_stream_view stream,
+                             memory_resource_args const& mr_args)
+  : _stream{stream}, _mr{mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr}
+{
+  cuda_set_device_raii dev{_device};
+  allocate_async(size);
+}
+
 device_buffer::device_buffer(void const* source_data,
                              std::size_t size,
                              cuda_stream_view stream,
@@ -43,6 +54,17 @@ device_buffer::device_buffer(void const* source_data,
   copy_async(source_data, size);
 }
 
+device_buffer::device_buffer(void const* source_data,
+                             std::size_t size,
+                             cuda_stream_view stream,
+                             memory_resource_args const& mr_args)
+  : _stream{stream}, _mr{mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr}
+{
+  cuda_set_device_raii dev{_device};
+  allocate_async(size);
+  copy_async(source_data, size);
+}
+
 device_buffer::device_buffer(device_buffer const& other,
                              cuda_stream_view stream,
                              device_async_resource_ref mr)
@@ -50,19 +72,30 @@ device_buffer::device_buffer(device_buffer const& other,
 {
 }
 
+device_buffer::device_buffer(device_buffer const& other,
+                             cuda_stream_view stream,
+                             memory_resource_args const& mr_args)
+  : device_buffer{other.data(), other.size(), stream, mr_args}
+{
+}
+
 device_buffer::device_buffer(device_buffer&& other) noexcept
   : _data{other._data},
     _size{other._size},
     _capacity{other._capacity},
     _stream{other.stream()},
    _mr{other._mr},
-    _device{other._device}
+    _device{other._device},
+    _host_mr{other._host_mr},
+    _host_bounce_buffer{other._host_bounce_buffer}
 {
   other._data     = nullptr;
   other._size     = 0;
   other._capacity = 0;
   other.set_stream(cuda_stream_view{});
-  other._device             = cuda_device_id{-1};
+  other._device             = cuda_device_id{-1};
+  other._host_mr            = std::nullopt;
+  other._host_bounce_buffer = std::nullopt;
 }
 
 device_buffer& device_buffer::operator=(device_buffer&& other) noexcept
@@ -75,14 +108,18 @@ device_buffer& device_buffer::operator=(device_buffer&& other) noexcept
     _size     = other._size;
     _capacity = other._capacity;
     set_stream(other.stream());
-    _mr     = other._mr;
-    _device = other._device;
+    _mr                 = other._mr;
+    _device             = other._device;
+    _host_mr            = other._host_mr;
+    _host_bounce_buffer = other._host_bounce_buffer;
 
     other._data     = nullptr;
     other._size     = 0;
     other._capacity = 0;
     other.set_stream(cuda_stream_view{});
-    other._device = cuda_device_id{-1};
+    other._device             = cuda_device_id{-1};
+    other._host_mr            = std::nullopt;
+    other._host_bounce_buffer = std::nullopt;
   }
   return *this;
 }
@@ -96,14 +133,30 @@
 
 void device_buffer::allocate_async(std::size_t bytes)
 {
-  _size     = bytes;
-  _capacity = bytes;
-  _data     = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr;
+  auto const old_capacity = _capacity;
+  _size                   = bytes;
+  _capacity               = bytes;
+  _data                   = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr;
+
+  // Resize host bounce buffer if needed
+  if (_host_mr.has_value() && bytes > 0) {
+    if (_host_bounce_buffer.has_value()) {
+      _host_mr->deallocate(_host_bounce_buffer.value(), old_capacity);
+    }
+    _host_bounce_buffer = _host_mr->allocate(bytes);
+  }
 }
 
 void device_buffer::deallocate_async() noexcept
 {
   if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); }
+
+  // Deallocate host bounce buffer if it exists
+  if (_host_bounce_buffer.has_value() && _host_mr.has_value()) {
+    _host_mr->deallocate(_host_bounce_buffer.value(), capacity());
+    _host_bounce_buffer = std::nullopt;
+  }
+
   _size     = 0;
   _capacity = 0;
   _data     = nullptr;
@@ -115,6 +168,20 @@ void device_buffer::copy_async(void const* source, std::size_t bytes)
     RMM_EXPECTS(nullptr != source, "Invalid copy from nullptr.");
     RMM_EXPECTS(nullptr != _data, "Invalid copy to nullptr.");
 
+    if (_host_bounce_buffer.has_value() && _host_mr.has_value()) {
+      // If source is host memory, use bounce buffer for optimized transfer
+      cudaPointerAttributes attributes;
+      cudaError_t result = cudaPointerGetAttributes(&attributes, source);
+      if (result == cudaSuccess && attributes.type == cudaMemoryTypeHost) {
+        RMM_CUDA_TRY(cudaMemcpyAsync(
+          _host_bounce_buffer.value(), source, bytes, cudaMemcpyHostToHost, stream().value()));
+        RMM_CUDA_TRY(cudaMemcpyAsync(
+          _data, _host_bounce_buffer.value(), bytes, cudaMemcpyHostToDevice, stream().value()));
+        return;
+      }
+    }
+
+    // Fallback to direct copy (device-to-device or host-to-device without bounce buffer)
     RMM_CUDA_TRY(cudaMemcpyAsync(_data, source, bytes, cudaMemcpyDefault, stream().value()));
   }
 }
@@ -124,9 +191,18 @@ void device_buffer::reserve(std::size_t new_capacity, cuda_stream_view stream)
   set_stream(stream);
   if (new_capacity > capacity()) {
     cuda_set_device_raii dev{_device};
-    auto tmp = device_buffer{new_capacity, stream, _mr};
+
+    device_buffer tmp;
+    if (_host_mr.has_value()) {
+      memory_resource_args args{_mr, _host_mr};
+      tmp = device_buffer{new_capacity, stream, args};
+    } else {
+      tmp = device_buffer{new_capacity, stream, _mr};
+    }
+
     auto const old_size = size();
-    RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
+    RMM_CUDA_TRY(cudaMemcpyAsync(
+      tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));  // device-to-device copy
     *this = std::move(tmp);
     _size = old_size;
   }
@@ -141,8 +217,17 @@ void device_buffer::resize(std::size_t new_size, cuda_stream_view stream)
     _size = new_size;
   } else {
     cuda_set_device_raii dev{_device};
-    auto tmp = device_buffer{new_size, stream, _mr};
-    RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
+
+    device_buffer tmp;
+    if (_host_mr.has_value()) {
+      memory_resource_args args{_mr, _host_mr};
+      tmp = device_buffer{new_size, stream, args};
+    } else {
+      tmp = device_buffer{new_size, stream, _mr};
+    }
+
+    RMM_CUDA_TRY(cudaMemcpyAsync(
+      tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));  // device-to-device copy
     *this = std::move(tmp);
   }
 }
@@ -155,8 +240,14 @@ void device_buffer::shrink_to_fit(cuda_stream_view stream)
     // Invoke copy ctor on self which only copies `[0, size())` and swap it
    // with self. The temporary `device_buffer` will hold the old contents
    // which will then be destroyed
-    auto tmp = device_buffer{*this, stream, _mr};
-    std::swap(tmp, *this);
+    device_buffer tmp;
+    if (_host_mr.has_value()) {
+      memory_resource_args args{_mr, _host_mr};
+      tmp = device_buffer{*this, stream, args};
+    } else {
+      tmp = device_buffer{*this, stream, _mr};
+    }
+    *this = std::move(tmp);
   }
 }
 
diff --git a/cpp/tests/device_buffer_tests.cu b/cpp/tests/device_buffer_tests.cu
index 7f17f4bec..85efdb6f0 100644
--- a/cpp/tests/device_buffer_tests.cu
+++ b/cpp/tests/device_buffer_tests.cu
@@ -24,6 +24,7 @@
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 #include <thrust/sequence.h>
 
@@ -497,3 +498,157 @@ TYPED_TEST(DeviceBufferTest, SetGetStream)
 
   EXPECT_EQ(buff.stream(), otherstream);
 }
+
+// Bounce buffer tests
+TYPED_TEST(DeviceBufferTest, ConstructorWithBounceBuffer)
+{
+  rmm::mr::pinned_memory_resource host_mr;
+  rmm::device_buffer::memory_resource_args mr_args{
+    rmm::device_async_resource_ref{this->mr},
+    std::make_optional<rmm::host_resource_ref>(&host_mr)};
+
+  rmm::device_buffer buff{this->size, this->stream, mr_args};
+  EXPECT_NE(nullptr, buff.data());
+  EXPECT_EQ(this->size, buff.size());
+  EXPECT_EQ(this->size, buff.capacity());
+  EXPECT_EQ(rmm::device_async_resource_ref{this->mr}, buff.memory_resource());
+  EXPECT_EQ(this->stream, buff.stream());
+}
+
+TYPED_TEST(DeviceBufferTest, CopyFromHostWithBounceBuffer)
+{
+  rmm::mr::pinned_memory_resource host_mr;
+  rmm::device_buffer::memory_resource_args mr_args{
+    rmm::device_async_resource_ref{this->mr},
+    std::make_optional<rmm::host_resource_ref>(&host_mr)};
+
+  std::vector<char> host_data(this->size);
+  std::iota(host_data.begin(), host_data.end(), 0);
+
+  rmm::device_buffer buff{
+    static_cast<void const*>(host_data.data()), this->size, this->stream, mr_args};
+
+  EXPECT_NE(nullptr, buff.data());
+  EXPECT_EQ(this->size, buff.size());
+  EXPECT_EQ(this->size, buff.capacity());
+  EXPECT_EQ(rmm::device_async_resource_ref{this->mr}, buff.memory_resource());
+  EXPECT_EQ(this->stream, buff.stream());
+
+  this->stream.synchronize();
+
+  // Verify data was copied correctly by copying back to host
+  std::vector<char> result_data(this->size);
+  RMM_CUDA_TRY(cudaMemcpy(result_data.data(), buff.data(), this->size, cudaMemcpyDeviceToHost));
+
+  EXPECT_TRUE(std::equal(host_data.begin(), host_data.end(), result_data.begin()));
+}
+
+TYPED_TEST(DeviceBufferTest, ReserveWithBounceBuffer)
+{
+  rmm::mr::pinned_memory_resource host_mr;
+  rmm::device_buffer::memory_resource_args mr_args{
+    rmm::device_async_resource_ref{this->mr},
+    std::make_optional<rmm::host_resource_ref>(&host_mr)};
+
+  rmm::device_buffer buff{this->size, this->stream, mr_args};
+
+  // Initialize buffer with test data
+  thrust::sequence(rmm::exec_policy(this->stream),
+                   static_cast<char*>(buff.data()),
+                   static_cast<char*>(buff.data()) + buff.size(),
+                   0);
+
+  auto* old_data    = buff.data();
+  auto new_capacity = this->size * 2;
+
+  buff.reserve(new_capacity, this->stream);
+
+  EXPECT_EQ(this->size, buff.size());
+  EXPECT_EQ(new_capacity, buff.capacity());
+  EXPECT_NE(old_data, buff.data());  // Should have reallocated
+
+  this->stream.synchronize();
+
+  // Verify data was preserved during reserve operation
+  std::vector<char> expected_data(this->size);
+  std::iota(expected_data.begin(), expected_data.end(), 0);
+
+  std::vector<char> result_data(this->size);
+  RMM_CUDA_TRY(cudaMemcpy(result_data.data(), buff.data(), this->size, cudaMemcpyDeviceToHost));
+
+  EXPECT_TRUE(std::equal(expected_data.begin(), expected_data.end(), result_data.begin()));
+}
+
+TYPED_TEST(DeviceBufferTest, ResizeWithBounceBuffer)
+{
+  rmm::mr::pinned_memory_resource host_mr;
+  rmm::device_buffer::memory_resource_args mr_args{
+    rmm::device_async_resource_ref{this->mr},
+    std::make_optional<rmm::host_resource_ref>(&host_mr)};
+
+  rmm::device_buffer buff{this->size, this->stream, mr_args};
+
+  // Initialize buffer with test data
+  thrust::sequence(rmm::exec_policy(this->stream),
+                   static_cast<char*>(buff.data()),
+                   static_cast<char*>(buff.data()) + buff.size(),
+                   0);
+
+  auto* old_data = buff.data();
+  auto new_size  = this->size * 2;
+
+  buff.resize(new_size, this->stream);
+
+  EXPECT_EQ(new_size, buff.size());
+  EXPECT_EQ(new_size, buff.capacity());
+  EXPECT_NE(old_data, buff.data());  // Should have reallocated
+
+  this->stream.synchronize();
+
+  // Verify original data was preserved during resize operation
+  std::vector<char> expected_data(this->size);
+  std::iota(expected_data.begin(), expected_data.end(), 0);
+
+  std::vector<char> result_data(this->size);
+  RMM_CUDA_TRY(cudaMemcpy(result_data.data(), buff.data(), this->size, cudaMemcpyDeviceToHost));
+
+  EXPECT_TRUE(std::equal(expected_data.begin(), expected_data.end(), result_data.begin()));
+}
+
+TYPED_TEST(DeviceBufferTest, ShrinkToFitWithBounceBuffer)
+{
+  rmm::mr::pinned_memory_resource host_mr;
+  rmm::device_buffer::memory_resource_args mr_args{
+    rmm::device_async_resource_ref{this->mr},
+    std::make_optional<rmm::host_resource_ref>(&host_mr)};
+
+  rmm::device_buffer buff{this->size, this->stream, mr_args};
+
+  // Reserve more capacity than needed
+  buff.reserve(this->size * 2, this->stream);
+
+  // Initialize buffer with test data
+  thrust::sequence(rmm::exec_policy(this->stream),
+                   static_cast<char*>(buff.data()),
+                   static_cast<char*>(buff.data()) + buff.size(),
+                   0);
+
+  auto* old_data = buff.data();
+
+  buff.shrink_to_fit(this->stream);
+
+  EXPECT_EQ(this->size, buff.size());
+  EXPECT_EQ(this->size, buff.capacity());  // Capacity should match size
+  EXPECT_NE(old_data, buff.data());  // Should have reallocated
+
+  this->stream.synchronize();
+
+  // Verify data was preserved during shrink operation
+  std::vector<char> expected_data(this->size);
+  std::iota(expected_data.begin(), expected_data.end(), 0);
+
+  std::vector<char> result_data(this->size);
+  RMM_CUDA_TRY(cudaMemcpy(result_data.data(), buff.data(), this->size, cudaMemcpyDeviceToHost));
+
+  EXPECT_TRUE(std::equal(expected_data.begin(), expected_data.end(), result_data.begin()));
+}
\ No newline at end of file