2121#include < vector>
2222
2323#include < benchmark/benchmark.h>
24+ #include < cuda_runtime.h>
2425
2526#include < rmm/cuda_stream.hpp>
2627
2728#include < rapidsmpf/buffer/pinned_memory_resource.hpp>
2829
2930namespace {
3031
// Inspired by the RMM async priming benchmark:
// https://github.com/rapidsai/rmm/blob/branch-25.12/cpp/benchmarks/async_priming/async_priming_bench.cpp
35+
3136/* *
3237 * @brief Factory function to create a cuda_async_memory_resource with priming
3338 */
@@ -133,7 +138,9 @@ void BM_AsyncPrimingImpact(
133138 * @brief Benchmark to measure construction time with and without priming
134139 */
135140template <typename MRFactoryFunc>
136- void BM_AsyncConstructionTime (benchmark::State& state, MRFactoryFunc factory, size_t initial_pool_size) {
141+ void BM_AsyncConstructionTime (
142+ benchmark::State& state, MRFactoryFunc factory, size_t initial_pool_size
143+ ) {
137144 for (auto _ : state) {
138145 auto start_time = std::chrono::high_resolution_clock::now ();
139146 auto [pool, mr] = factory (initial_pool_size);
@@ -149,23 +156,145 @@ void BM_AsyncConstructionTime(benchmark::State& state, MRFactoryFunc factory, si
149156 }
150157}
151158
159+ /* *
160+ * @brief Benchmark to compare device to host copy vs device to stream-ordered pinned copy
161+ */
162+ template <typename MRFactoryFunc>
163+ void BM_DeviceToHostCopyComparison (
164+ benchmark::State& state, MRFactoryFunc factory, size_t copy_size, size_t n_copies
165+ ) {
166+ // Create memory resource for stream-ordered pinned memory
167+ auto [pool, mr] = factory (0 );
168+
169+ rmm::cuda_stream stream{rmm::cuda_stream::flags::non_blocking};
170+
171+ // Allocate device memory
172+ rmm::device_buffer device_buf (copy_size, stream);
173+ stream.synchronize ();
174+
175+ // Preallocate regular host memory buffers
176+ std::vector<std::vector<std::byte>> host_bufs;
177+ host_bufs.reserve (n_copies);
178+ for (size_t i = 0 ; i < n_copies; ++i) {
179+ host_bufs.emplace_back (copy_size);
180+ }
181+
182+ // Preallocate stream-ordered pinned memory buffers
183+ std::vector<rapidsmpf::PinnedHostBuffer> pinned_bufs;
184+ pinned_bufs.reserve (n_copies);
185+ for (size_t i = 0 ; i < n_copies; ++i) {
186+ pinned_bufs.emplace_back (copy_size, stream, mr);
187+ }
188+ stream.synchronize ();
189+
190+ for (auto _ : state) {
191+ state.PauseTiming ();
192+
193+ // Reset stream
194+ stream.synchronize ();
195+
196+ state.ResumeTiming ();
197+
198+ // Time device to regular host copies (synchronous)
199+ auto host_copy_start = std::chrono::high_resolution_clock::now ();
200+ for (size_t i = 0 ; i < n_copies; ++i) {
201+ cudaMemcpy (
202+ host_bufs[i].data (), device_buf.data (), copy_size, cudaMemcpyDefault
203+ );
204+ }
205+ auto host_copy_end = std::chrono::high_resolution_clock::now ();
206+
207+ // Time device to stream-ordered pinned copies (asynchronous)
208+ auto pinned_copy_start = std::chrono::high_resolution_clock::now ();
209+ for (size_t i = 0 ; i < n_copies; ++i) {
210+ cudaMemcpyAsync (
211+ pinned_bufs[i].data (),
212+ device_buf.data (),
213+ copy_size,
214+ cudaMemcpyDefault,
215+ stream.value ()
216+ );
217+ }
218+ stream.synchronize ();
219+ auto pinned_copy_end = std::chrono::high_resolution_clock::now ();
220+
221+ // Calculate times in nanoseconds
222+ auto host_copy_time_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
223+ host_copy_end - host_copy_start
224+ )
225+ .count ();
226+ auto pinned_copy_time_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
227+ pinned_copy_end - pinned_copy_start
228+ )
229+ .count ();
230+
231+ // Calculate bandwidth (GB/s) - total data transferred
232+ auto total_bytes = static_cast <double >(copy_size * n_copies);
233+ auto host_bandwidth_gbps =
234+ (total_bytes / 1e9 ) / (host_copy_time_ns / 1e9 ); // GB/s
235+ auto pinned_bandwidth_gbps =
236+ (total_bytes / 1e9 ) / (pinned_copy_time_ns / 1e9 ); // GB/s
237+
238+ // Set benchmark counters
239+ state.counters [" host_copy_time_ns" ] = host_copy_time_ns;
240+ state.counters [" pinned_copy_time_ns" ] = pinned_copy_time_ns;
241+ state.counters [" host_bandwidth_gbps" ] = host_bandwidth_gbps;
242+ state.counters [" pinned_bandwidth_gbps" ] = pinned_bandwidth_gbps;
243+ state.counters [" speedup" ] = static_cast <double >(host_copy_time_ns)
244+ / static_cast <double >(pinned_copy_time_ns);
245+ }
246+ }
247+
152248} // namespace
153249
154250// Register benchmarks
155- BENCHMARK_CAPTURE (
156- BM_AsyncPrimingImpact, unprimed, &make_pinned_resource, 0 , 10 << 20
157- ) // 1MB allocations
251+
252+ // 10MB allocations with no priming
253+ BENCHMARK_CAPTURE (BM_AsyncPrimingImpact, unprimed, &make_pinned_resource, 0 , 10 << 20 )
158254 ->Unit(benchmark::kMicrosecond );
159255
256+ // 10MB allocations with 1000MB priming
160257BENCHMARK_CAPTURE (
161258 BM_AsyncPrimingImpact, primed, &make_pinned_resource, 1000 << 20 , 10 << 20
162- ) // 1MB allocations with 100MB priming
259+ )
163260 ->Unit(benchmark::kMicrosecond );
164261
262+ // no priming
165263BENCHMARK_CAPTURE (BM_AsyncConstructionTime, unprimed, &make_pinned_resource, 0 )
166264 ->Unit(benchmark::kMicrosecond );
167-
265+ // 1000MB priming
168266BENCHMARK_CAPTURE (BM_AsyncConstructionTime, primed, &make_pinned_resource, 1000 << 20 )
169267 ->Unit(benchmark::kMicrosecond );
170268
171- BENCHMARK_MAIN ();
269+ // Device to Host Copy Comparison benchmarks
270+ // 1MB copy, 1 copy
271+ BENCHMARK_CAPTURE (
272+ BM_DeviceToHostCopyComparison, 1MB_1copy, &make_pinned_resource, 1 << 20 , 1
273+ )
274+ ->Unit(benchmark::kMicrosecond );
275+
276+ // 1MB copy, 10 copies
277+ BENCHMARK_CAPTURE (
278+ BM_DeviceToHostCopyComparison, 1MB_10copies, &make_pinned_resource, 1 << 20 , 10
279+ )
280+ ->Unit(benchmark::kMicrosecond );
281+
282+ // 10MB copy, 1 copy
283+ BENCHMARK_CAPTURE (
284+ BM_DeviceToHostCopyComparison, 10MB_1copy, &make_pinned_resource, 10 << 20 , 1
285+ )
286+ ->Unit(benchmark::kMicrosecond );
287+
288+ // 10MB copy, 10 copies
289+ BENCHMARK_CAPTURE (
290+ BM_DeviceToHostCopyComparison, 10MB_10copies, &make_pinned_resource, 10 << 20 , 10
291+ )
292+ ->Unit(benchmark::kMicrosecond );
293+
294+ // 100MB copy, 1 copy
295+ BENCHMARK_CAPTURE (
296+ BM_DeviceToHostCopyComparison, 100MB_1copy, &make_pinned_resource, 100 << 20 , 1
297+ )
298+ ->Unit(benchmark::kMicrosecond );
299+
300+ BENCHMARK_MAIN ();
0 commit comments