2626#include < rmm/cuda_stream.hpp>
2727
2828#include < rapidsmpf/buffer/pinned_memory_resource.hpp>
29+ #include < rapidsmpf/error.hpp>
2930
3031namespace {
3132
@@ -198,26 +199,30 @@ void BM_DeviceToHostCopyComparison(
198199 // Time device to regular host copies (synchronous)
199200 auto host_copy_start = std::chrono::high_resolution_clock::now ();
200201 for (size_t i = 0 ; i < n_copies; ++i) {
201- cudaMemcpy (
202+ RAPIDSMPF_CUDA_TRY ( cudaMemcpy (
202203 host_bufs[i].data (), device_buf.data (), copy_size, cudaMemcpyDefault
203- );
204+ )) ;
204205 }
205206 auto host_copy_end = std::chrono::high_resolution_clock::now ();
206207
207208 // Time device to stream-ordered pinned copies (asynchronous)
208209 auto pinned_copy_start = std::chrono::high_resolution_clock::now ();
209210 for (size_t i = 0 ; i < n_copies; ++i) {
210- cudaMemcpyAsync (
211+ RAPIDSMPF_CUDA_TRY ( cudaMemcpyAsync (
211212 pinned_bufs[i].data (),
212213 device_buf.data (),
213214 copy_size,
214215 cudaMemcpyDefault,
215216 stream.value ()
216- );
217+ )) ;
217218 }
218219 stream.synchronize ();
219220 auto pinned_copy_end = std::chrono::high_resolution_clock::now ();
220221
222+ benchmark::DoNotOptimize (device_buf);
223+ benchmark::DoNotOptimize (host_bufs);
224+ benchmark::DoNotOptimize (pinned_bufs);
225+
221226 // Calculate times in nanoseconds
222227 auto host_copy_time_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
223228 host_copy_end - host_copy_start
@@ -228,6 +233,7 @@ void BM_DeviceToHostCopyComparison(
228233 )
229234 .count ();
230235
236+
231237 // Calculate bandwidth (GB/s) - total data transferred
232238 auto total_bytes = static_cast <double >(copy_size * n_copies);
233239 auto host_bandwidth_gbps =
@@ -262,39 +268,53 @@ BENCHMARK_CAPTURE(
262268// no priming
263269BENCHMARK_CAPTURE (BM_AsyncConstructionTime, unprimed, &make_pinned_resource, 0 )
264270 ->Unit(benchmark::kMicrosecond );
265- // 1000MB priming
266- BENCHMARK_CAPTURE (BM_AsyncConstructionTime, primed , &make_pinned_resource, 1000 << 20 )
271+ // 1GB priming
272+ BENCHMARK_CAPTURE (BM_AsyncConstructionTime, primed_1GB , &make_pinned_resource, 1 << 30 )
267273 ->Unit(benchmark::kMicrosecond );
268-
269- // Device to Host Copy Comparison benchmarks
270- // 1MB copy, 1 copy
271- BENCHMARK_CAPTURE (
272- BM_DeviceToHostCopyComparison, 1MB_1copy, &make_pinned_resource, 1 << 20 , 1
273- )
274+ // 4GB priming
275+ BENCHMARK_CAPTURE (BM_AsyncConstructionTime, primed_4GB, &make_pinned_resource, 4 << 30 )
274276 ->Unit(benchmark::kMicrosecond );
275277
276- // 1MB copy, 10 copies
277- BENCHMARK_CAPTURE (
278- BM_DeviceToHostCopyComparison, 1MB_10copies, &make_pinned_resource, 1 << 20 , 10
279- )
280- ->Unit(benchmark::kMicrosecond );
281-
282- // 10MB copy, 1 copy
283- BENCHMARK_CAPTURE (
284- BM_DeviceToHostCopyComparison, 10MB_1copy, &make_pinned_resource, 10 << 20 , 1
285- )
286- ->Unit(benchmark::kMicrosecond );
287-
288- // 10MB copy, 10 copies
289- BENCHMARK_CAPTURE (
290- BM_DeviceToHostCopyComparison, 10MB_10copies, &make_pinned_resource, 10 << 20 , 10
291- )
292- ->Unit(benchmark::kMicrosecond );
278+ // Device to Host Copy Comparison benchmarks - registered in a loop
279+ static auto register_device_to_host_copy_benchmarks = [] {
280+ struct BenchConfig {
281+ size_t copy_size_mb;
282+ size_t n_copies;
283+ };
284+
285+ constexpr std::array<BenchConfig, 12 > configs = {{
286+ {.copy_size_mb = 1 , .n_copies = 1 },
287+ {.copy_size_mb = 1 , .n_copies = 10 },
288+ {.copy_size_mb = 1 , .n_copies = 100 },
289+ {.copy_size_mb = 4 , .n_copies = 1 },
290+ {.copy_size_mb = 4 , .n_copies = 10 },
291+ {.copy_size_mb = 4 , .n_copies = 100 },
292+ {.copy_size_mb = 10 , .n_copies = 1 },
293+ {.copy_size_mb = 10 , .n_copies = 10 },
294+ {.copy_size_mb = 10 , .n_copies = 100 },
295+ {.copy_size_mb = 100 , .n_copies = 1 },
296+ {.copy_size_mb = 100 , .n_copies = 10 },
297+ {.copy_size_mb = 100 , .n_copies = 100 },
298+ }};
299+
300+ for (const auto & config : configs) {
301+ std::string name = " BM_DeviceToHostCopyComparison/"
302+ + std::to_string (config.copy_size_mb ) + " MB_"
303+ + std::to_string (config.n_copies ) + " copies" ;
304+
305+ benchmark::RegisterBenchmark (
306+ name.c_str (),
307+ [](benchmark::State& state, auto factory, size_t copy_size, size_t n_copies) {
308+ BM_DeviceToHostCopyComparison (state, factory, copy_size, n_copies);
309+ },
310+ &make_pinned_resource,
311+ config.copy_size_mb << 20 ,
312+ config.n_copies
313+ )
314+ ->Unit (benchmark::kMicrosecond );
315+ }
293316
294- // 100MB copy, 1 copy
295- BENCHMARK_CAPTURE (
296- BM_DeviceToHostCopyComparison, 100MB_1copy, &make_pinned_resource, 100 << 20 , 1
297- )
298- ->Unit(benchmark::kMicrosecond );
317+ return 0 ;
318+ }();
299319
300320BENCHMARK_MAIN ();
0 commit comments