diff --git a/cpp/include/rmm/device_buffer.hpp b/cpp/include/rmm/device_buffer.hpp
index 3e3f92564..aadc2ea7e 100644
--- a/cpp/include/rmm/device_buffer.hpp
+++ b/cpp/include/rmm/device_buffer.hpp
@@ -26,6 +26,7 @@
 #include <stdexcept>
 #include <utility>
+#include <optional>
 
 namespace RMM_NAMESPACE {
 
 /**
@@ -80,6 +81,17 @@ namespace RMM_NAMESPACE {
  */
 class device_buffer {
  public:
+  /**
+   * @brief A struct to configure memory resources for a `device_buffer`.
+   *
+   * This struct allows for specifying a device memory resource for the buffer's storage and
+   * an optional host memory resource for a bounce buffer to optimize host-device transfers.
+   */
+  struct memory_resource_args {
+    device_async_resource_ref device_mr{mr::get_current_device_resource_ref()};
+    std::optional<host_resource_ref> bounce_buffer_host_mr{std::nullopt};
+  };
+
   // The copy constructor and copy assignment operator without a stream are deleted because they
   // provide no way to specify an explicit stream
   device_buffer(device_buffer const& other) = delete;
@@ -107,6 +119,21 @@
                          cuda_stream_view stream,
                          device_async_resource_ref mr = mr::get_current_device_resource_ref());
 
+  /**
+   * @brief Constructs a new device buffer of `size` uninitialized bytes
+   * with optional host bounce buffer
+   *
+   * @throws rmm::bad_alloc If allocation fails.
+   *
+   * @param size Size in bytes to allocate in device memory.
+   * @param stream CUDA stream on which memory may be allocated if the memory
+   * resource supports streams.
+   * @param mr_args Arguments to configure memory resources for a `device_buffer`.
+   */
+  explicit device_buffer(std::size_t size,
+                         cuda_stream_view stream,
+                         memory_resource_args const& mr_args);
+
   /**
    * @brief Construct a new device buffer by copying from a raw pointer to an existing host or
    * device memory allocation.
@@ -131,6 +158,30 @@ class device_buffer {
                 cuda_stream_view stream,
                 device_async_resource_ref mr = mr::get_current_device_resource_ref());
 
+  /**
+   * @brief Construct a new device buffer by copying from a raw pointer to an existing host or
+   * device memory allocation with optional host bounce buffer.
+   *
+   * @note This function does not synchronize `stream`. `source_data` is copied on `stream`, so the
+   * caller is responsible for correct synchronization to ensure that `source_data` is valid when
+   * the copy occurs. This includes destroying `source_data` in stream order after this function is
+   * called, or synchronizing or waiting on `stream` after this function returns as necessary.
+   *
+   * @throws rmm::bad_alloc If creating the new allocation fails.
+   * @throws rmm::logic_error If `source_data` is null, and `size != 0`.
+   * @throws rmm::cuda_error if copying from the device memory fails.
+   *
+   * @param source_data Pointer to the host or device memory to copy from.
+   * @param size Size in bytes to copy.
+   * @param stream CUDA stream on which memory may be allocated if the memory
+   * resource supports streams.
+   * @param mr_args Arguments to configure memory resources for a `device_buffer`.
+   */
+  device_buffer(void const* source_data,
+                std::size_t size,
+                cuda_stream_view stream,
+                memory_resource_args const& mr_args);
+
   /**
    * @brief Construct a new `device_buffer` by deep copying the contents of
    * another `device_buffer`, optionally using the specified stream and memory
@@ -156,6 +207,30 @@
                 cuda_stream_view stream,
                 device_async_resource_ref mr = mr::get_current_device_resource_ref());
 
+  /**
+   * @brief Construct a new `device_buffer` by deep copying the contents of
+   * another `device_buffer` with optional host bounce buffer.
+   *
+   * @note Only copies `other.size()` bytes from `other`, i.e., if
+   *`other.size() != other.capacity()`, then the size and capacity of the newly
+   * constructed `device_buffer` will be equal to `other.size()`.
+   *
+   * @note This function does not synchronize `stream`. `other` is copied on `stream`, so the
+   * caller is responsible for correct synchronization to ensure that `other` is valid when
+   * the copy occurs. This includes destroying `other` in stream order after this function is
+   * called, or synchronizing or waiting on `stream` after this function returns as necessary.
+   *
+   * @throws rmm::bad_alloc If creating the new allocation fails.
+   * @throws rmm::cuda_error if copying from `other` fails.
+   *
+   * @param other The `device_buffer` whose contents will be copied
+   * @param stream The stream to use for the allocation and copy
+   * @param mr_args Arguments to configure memory resources for a `device_buffer`.
+   */
+  device_buffer(device_buffer const& other,
+                cuda_stream_view stream,
+                memory_resource_args const& mr_args);
+
   /**
    * @brief Constructs a new `device_buffer` by moving the contents of another
    * `device_buffer` into the newly constructed one.
@@ -332,6 +407,11 @@ class device_buffer {
                                                 ///< allocate/deallocate device memory
   cuda_device_id _device{get_current_cuda_device()};
 
+  std::optional<host_resource_ref> _host_mr{
+    std::nullopt};  ///< Optional host memory resource for bounce buffers
+  std::optional<void*> _host_bounce_buffer{
+    std::nullopt};  ///< Optional bounce buffer for host-device transfers
+
   /**
    * @brief Allocates the specified amount of memory and updates the size/capacity accordingly.
    *
diff --git a/cpp/src/device_buffer.cpp b/cpp/src/device_buffer.cpp
index c2857b756..62b3ad1dd 100644
--- a/cpp/src/device_buffer.cpp
+++ b/cpp/src/device_buffer.cpp
@@ -19,6 +19,8 @@
 
 #include <rmm/device_buffer.hpp>
 
+#include <cuda_runtime_api.h>
+
 namespace rmm {
 
 device_buffer::device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {}
@@ -32,6 +34,15 @@ device_buffer::device_buffer(std::size_t size,
   allocate_async(size);
 }
 
+device_buffer::device_buffer(std::size_t size,
+                             cuda_stream_view stream,
+                             memory_resource_args const& mr_args)
+  : _stream{stream}, _mr{mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr}
+{
+  cuda_set_device_raii dev{_device};
+  allocate_async(size);
+}
+
 device_buffer::device_buffer(void const* source_data,
                              std::size_t size,
                              cuda_stream_view stream,
@@ -43,6 +54,17 @@ device_buffer::device_buffer(void const* source_data,
   copy_async(source_data, size);
 }
 
+device_buffer::device_buffer(void const* source_data,
+                             std::size_t size,
+                             cuda_stream_view stream,
+                             memory_resource_args const& mr_args)
+  : _stream{stream}, _mr{mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr}
+{
+  cuda_set_device_raii dev{_device};
+  allocate_async(size);
+  copy_async(source_data, size);
+}
+
 device_buffer::device_buffer(device_buffer const& other,
                              cuda_stream_view stream,
                              device_async_resource_ref mr)
@@ -50,19 +72,30 @@ device_buffer::device_buffer(device_buffer const& other,
 {
 }
 
+device_buffer::device_buffer(device_buffer const& other,
+                             cuda_stream_view stream,
+                             memory_resource_args const& mr_args)
+  : device_buffer{other.data(), other.size(), stream, mr_args}
+{
+}
+
 device_buffer::device_buffer(device_buffer&& other) noexcept
   : _data{other._data},
     _size{other._size},
     _capacity{other._capacity},
     _stream{other.stream()},
    _mr{other._mr},
-    _device{other._device}
+    _device{other._device},
+    _host_mr{other._host_mr},
+    _host_bounce_buffer{other._host_bounce_buffer}
 {
   other._data     = nullptr;
   other._size     = 0;
   other._capacity = 0;
   other.set_stream(cuda_stream_view{});
-  other._device             = cuda_device_id{-1};
+  other._device             = cuda_device_id{-1};
+  other._host_mr            = std::nullopt;
+  other._host_bounce_buffer = std::nullopt;
 }
 
 device_buffer& device_buffer::operator=(device_buffer&& other) noexcept
@@ -75,14 +108,18 @@ device_buffer& device_buffer::operator=(device_buffer&& other) noexcept
     _size     = other._size;
     _capacity = other._capacity;
     set_stream(other.stream());
-    _mr     = other._mr;
-    _device = other._device;
+    _mr                 = other._mr;
+    _device             = other._device;
+    _host_mr            = other._host_mr;
+    _host_bounce_buffer = other._host_bounce_buffer;
 
     other._data     = nullptr;
     other._size     = 0;
     other._capacity = 0;
     other.set_stream(cuda_stream_view{});
-    other._device = cuda_device_id{-1};
+    other._device             = cuda_device_id{-1};
+    other._host_mr            = std::nullopt;
+    other._host_bounce_buffer = std::nullopt;
   }
   return *this;
 }
@@ -96,14 +133,30 @@
 
 void device_buffer::allocate_async(std::size_t bytes)
 {
-  _size     = bytes;
-  _capacity = bytes;
-  _data     = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr;
+  auto const old_capacity = _capacity;
+  _size                   = bytes;
+  _capacity               = bytes;
+  _data                   = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr;
+
+  // Resize host bounce buffer if needed
+  if (_host_mr.has_value() && bytes > 0) {
+    if (_host_bounce_buffer.has_value()) {
+      _host_mr->deallocate(_host_bounce_buffer.value(), old_capacity);
+    }
+    _host_bounce_buffer = _host_mr->allocate(bytes);
+  }
 }
 
 void device_buffer::deallocate_async() noexcept
 {
   if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); }
+
+  // Deallocate host bounce buffer if it exists
+  if (_host_bounce_buffer.has_value() && _host_mr.has_value()) {
+    _host_mr->deallocate(_host_bounce_buffer.value(), capacity());
+    _host_bounce_buffer = std::nullopt;
+  }
+
   _size     = 0;
   _capacity = 0;
   _data     = nullptr;
@@ -115,6 +168,20 @@ void device_buffer::copy_async(void const* source, std::size_t bytes)
     RMM_EXPECTS(nullptr != source, "Invalid copy from nullptr.");
     RMM_EXPECTS(nullptr != _data, "Invalid copy to nullptr.");
 
+    if (_host_bounce_buffer.has_value() && _host_mr.has_value()) {
+      // If source is host memory, use bounce buffer for optimized transfer
+      cudaPointerAttributes attributes;
+      cudaError_t result = cudaPointerGetAttributes(&attributes, source);
+      if (result == cudaSuccess && attributes.type == cudaMemoryTypeHost) {
+        RMM_CUDA_TRY(cudaMemcpyAsync(
+          _host_bounce_buffer.value(), source, bytes, cudaMemcpyHostToHost, stream().value()));
+        RMM_CUDA_TRY(cudaMemcpyAsync(
+          _data, _host_bounce_buffer.value(), bytes, cudaMemcpyHostToDevice, stream().value()));
+        return;
+      }
+    }
+
+    // Fallback to direct copy (device-to-device or host-to-device without bounce buffer)
     RMM_CUDA_TRY(cudaMemcpyAsync(_data, source, bytes, cudaMemcpyDefault, stream().value()));
   }
 }
@@ -124,9 +191,18 @@ void device_buffer::reserve(std::size_t new_capacity, cuda_stream_view stream)
   set_stream(stream);
   if (new_capacity > capacity()) {
     cuda_set_device_raii dev{_device};
-    auto tmp = device_buffer{new_capacity, stream, _mr};
+
+    device_buffer tmp;
+    if (_host_mr.has_value()) {
+      memory_resource_args args{_mr, _host_mr};
+      tmp = device_buffer{new_capacity, stream, args};
+    } else {
+      tmp = device_buffer{new_capacity, stream, _mr};
+    }
+
     auto const old_size = size();
-    RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
+    RMM_CUDA_TRY(cudaMemcpyAsync(
+      tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));  // device-to-device copy
     *this = std::move(tmp);
     _size = old_size;
   }
@@ -141,8 +217,17 @@ void device_buffer::resize(std::size_t new_size, cuda_stream_view stream)
     _size = new_size;
   } else {
     cuda_set_device_raii dev{_device};
-    auto tmp = device_buffer{new_size, stream, _mr};
-    RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
+
+    device_buffer tmp;
+    if (_host_mr.has_value()) {
+      memory_resource_args args{_mr, _host_mr};
+      tmp = device_buffer{new_size, stream, args};
+    } else {
+      tmp = device_buffer{new_size, stream, _mr};
+    }
+
+    RMM_CUDA_TRY(cudaMemcpyAsync(
+      tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));  // device-to-device copy
     *this = std::move(tmp);
   }
 }
@@ -155,8 +240,14 @@ void device_buffer::shrink_to_fit(cuda_stream_view stream)
     // Invoke copy ctor on self which only copies `[0, size())` and swap it
    // with self. The temporary `device_buffer` will hold the old contents
    // which will then be destroyed
-    auto tmp = device_buffer{*this, stream, _mr};
-    std::swap(tmp, *this);
+    device_buffer tmp;
+    if (_host_mr.has_value()) {
+      memory_resource_args args{_mr, _host_mr};
+      tmp = device_buffer{*this, stream, args};
+    } else {
+      tmp = device_buffer{*this, stream, _mr};
+    }
+    *this = std::move(tmp);
   }
 }
 
diff --git a/cpp/tests/device_buffer_tests.cu b/cpp/tests/device_buffer_tests.cu
index 7f17f4bec..85efdb6f0 100644
--- a/cpp/tests/device_buffer_tests.cu
+++ b/cpp/tests/device_buffer_tests.cu
@@ -24,6 +24,7 @@
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 #include <thrust/sequence.h>
 
@@ -497,3 +498,157 @@ TYPED_TEST(DeviceBufferTest, SetGetStream)
 
   EXPECT_EQ(buff.stream(), otherstream);
 }
+
+// Bounce buffer tests
+TYPED_TEST(DeviceBufferTest, ConstructorWithBounceBuffer)
+{
+  rmm::mr::pinned_memory_resource host_mr;
+  rmm::device_buffer::memory_resource_args mr_args{
+    rmm::device_async_resource_ref{this->mr},
+    std::make_optional<rmm::host_resource_ref>(&host_mr)};
+
+  rmm::device_buffer buff{this->size, this->stream, mr_args};
+  EXPECT_NE(nullptr, buff.data());
+  EXPECT_EQ(this->size, buff.size());
+  EXPECT_EQ(this->size, buff.capacity());
+  EXPECT_EQ(rmm::device_async_resource_ref{this->mr}, buff.memory_resource());
+  EXPECT_EQ(this->stream, buff.stream());
+}
+
+TYPED_TEST(DeviceBufferTest, CopyFromHostWithBounceBuffer)
+{
+  rmm::mr::pinned_memory_resource host_mr;
+  rmm::device_buffer::memory_resource_args mr_args{
+    rmm::device_async_resource_ref{this->mr},
+    std::make_optional<rmm::host_resource_ref>(&host_mr)};
+
+  std::vector<char> host_data(this->size);
+  std::iota(host_data.begin(), host_data.end(), 0);
+
+  rmm::device_buffer buff{
+    static_cast<void const*>(host_data.data()), this->size, this->stream, mr_args};
+
+  EXPECT_NE(nullptr, buff.data());
+  EXPECT_EQ(this->size, buff.size());
+  EXPECT_EQ(this->size, buff.capacity());
+  EXPECT_EQ(rmm::device_async_resource_ref{this->mr}, buff.memory_resource());
+  EXPECT_EQ(this->stream, buff.stream());
+
+  this->stream.synchronize();
+
+  // Verify data was copied correctly by copying back to host
+  std::vector<char> result_data(this->size);
+  RMM_CUDA_TRY(cudaMemcpy(result_data.data(), buff.data(), this->size, cudaMemcpyDeviceToHost));
+
+  EXPECT_TRUE(std::equal(host_data.begin(), host_data.end(), result_data.begin()));
+}
+
+TYPED_TEST(DeviceBufferTest, ReserveWithBounceBuffer)
+{
+  rmm::mr::pinned_memory_resource host_mr;
+  rmm::device_buffer::memory_resource_args mr_args{
+    rmm::device_async_resource_ref{this->mr},
+    std::make_optional<rmm::host_resource_ref>(&host_mr)};
+
+  rmm::device_buffer buff{this->size, this->stream, mr_args};
+
+  // Initialize buffer with test data
+  thrust::sequence(rmm::exec_policy(this->stream),
+                   static_cast<char*>(buff.data()),
+                   static_cast<char*>(buff.data()) + buff.size(),
+                   0);
+
+  auto* old_data    = buff.data();
+  auto new_capacity = this->size * 2;
+
+  buff.reserve(new_capacity, this->stream);
+
+  EXPECT_EQ(this->size, buff.size());
+  EXPECT_EQ(new_capacity, buff.capacity());
+  EXPECT_NE(old_data, buff.data());  // Should have reallocated
+
+  this->stream.synchronize();
+
+  // Verify data was preserved during reserve operation
+  std::vector<char> expected_data(this->size);
+  std::iota(expected_data.begin(), expected_data.end(), 0);
+
+  std::vector<char> result_data(this->size);
+  RMM_CUDA_TRY(cudaMemcpy(result_data.data(), buff.data(), this->size, cudaMemcpyDeviceToHost));
+
+  EXPECT_TRUE(std::equal(expected_data.begin(), expected_data.end(), result_data.begin()));
+}
+
+TYPED_TEST(DeviceBufferTest, ResizeWithBounceBuffer)
+{
+  rmm::mr::pinned_memory_resource host_mr;
+  rmm::device_buffer::memory_resource_args mr_args{
+    rmm::device_async_resource_ref{this->mr},
+    std::make_optional<rmm::host_resource_ref>(&host_mr)};
+
+  rmm::device_buffer buff{this->size, this->stream, mr_args};
+
+  // Initialize buffer with test data
+  thrust::sequence(rmm::exec_policy(this->stream),
+                   static_cast<char*>(buff.data()),
+                   static_cast<char*>(buff.data()) + buff.size(),
+                   0);
+
+  auto* old_data = buff.data();
+  auto new_size  = this->size * 2;
+
+  buff.resize(new_size, this->stream);
+
+  EXPECT_EQ(new_size, buff.size());
+  EXPECT_EQ(new_size, buff.capacity());
+  EXPECT_NE(old_data, buff.data());  // Should have reallocated
+
+  this->stream.synchronize();
+
+  // Verify original data was preserved during resize operation
+  std::vector<char> expected_data(this->size);
+  std::iota(expected_data.begin(), expected_data.end(), 0);
+
+  std::vector<char> result_data(this->size);
+  RMM_CUDA_TRY(cudaMemcpy(result_data.data(), buff.data(), this->size, cudaMemcpyDeviceToHost));
+
+  EXPECT_TRUE(std::equal(expected_data.begin(), expected_data.end(), result_data.begin()));
+}
+
+TYPED_TEST(DeviceBufferTest, ShrinkToFitWithBounceBuffer)
+{
+  rmm::mr::pinned_memory_resource host_mr;
+  rmm::device_buffer::memory_resource_args mr_args{
+    rmm::device_async_resource_ref{this->mr},
+    std::make_optional<rmm::host_resource_ref>(&host_mr)};
+
+  rmm::device_buffer buff{this->size, this->stream, mr_args};
+
+  // Reserve more capacity than needed
+  buff.reserve(this->size * 2, this->stream);
+
+  // Initialize buffer with test data
+  thrust::sequence(rmm::exec_policy(this->stream),
+                   static_cast<char*>(buff.data()),
+                   static_cast<char*>(buff.data()) + buff.size(),
+                   0);
+
+  auto* old_data = buff.data();
+
+  buff.shrink_to_fit(this->stream);
+
+  EXPECT_EQ(this->size, buff.size());
+  EXPECT_EQ(this->size, buff.capacity());  // Capacity should match size
+  EXPECT_NE(old_data, buff.data());  // Should have reallocated
+
+  this->stream.synchronize();
+
+  // Verify data was preserved during shrink operation
+  std::vector<char> expected_data(this->size);
+  std::iota(expected_data.begin(), expected_data.end(), 0);
+
+  std::vector<char> result_data(this->size);
+  RMM_CUDA_TRY(cudaMemcpy(result_data.data(), buff.data(), this->size, cudaMemcpyDeviceToHost));
+
+  EXPECT_TRUE(std::equal(expected_data.begin(), expected_data.end(), result_data.begin()));
+}
\ No newline at end of file