Skip to content

Commit ddc5452

Browse files
committed
more benchmarks
Signed-off-by: niranda perera <[email protected]>
1 parent 717e523 commit ddc5452

File tree

1 file changed

+54
-34
lines changed

1 file changed

+54
-34
lines changed

cpp/benchmarks/bench_pinned_memory_resources.cpp

Lines changed: 54 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <rmm/cuda_stream.hpp>
2727

2828
#include <rapidsmpf/buffer/pinned_memory_resource.hpp>
29+
#include <rapidsmpf/error.hpp>
2930

3031
namespace {
3132

@@ -198,26 +199,30 @@ void BM_DeviceToHostCopyComparison(
198199
// Time device to regular host copies (synchronous)
199200
auto host_copy_start = std::chrono::high_resolution_clock::now();
200201
for (size_t i = 0; i < n_copies; ++i) {
201-
cudaMemcpy(
202+
RAPIDSMPF_CUDA_TRY(cudaMemcpy(
202203
host_bufs[i].data(), device_buf.data(), copy_size, cudaMemcpyDefault
203-
);
204+
));
204205
}
205206
auto host_copy_end = std::chrono::high_resolution_clock::now();
206207

207208
// Time device to stream-ordered pinned copies (asynchronous)
208209
auto pinned_copy_start = std::chrono::high_resolution_clock::now();
209210
for (size_t i = 0; i < n_copies; ++i) {
210-
cudaMemcpyAsync(
211+
RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
211212
pinned_bufs[i].data(),
212213
device_buf.data(),
213214
copy_size,
214215
cudaMemcpyDefault,
215216
stream.value()
216-
);
217+
));
217218
}
218219
stream.synchronize();
219220
auto pinned_copy_end = std::chrono::high_resolution_clock::now();
220221

222+
benchmark::DoNotOptimize(device_buf);
223+
benchmark::DoNotOptimize(host_bufs);
224+
benchmark::DoNotOptimize(pinned_bufs);
225+
221226
// Calculate times in nanoseconds
222227
auto host_copy_time_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
223228
host_copy_end - host_copy_start
@@ -228,6 +233,7 @@ void BM_DeviceToHostCopyComparison(
228233
)
229234
.count();
230235

236+
231237
// Calculate bandwidth (GB/s) - total data transferred
232238
auto total_bytes = static_cast<double>(copy_size * n_copies);
233239
auto host_bandwidth_gbps =
@@ -262,39 +268,53 @@ BENCHMARK_CAPTURE(
262268
// no priming
263269
BENCHMARK_CAPTURE(BM_AsyncConstructionTime, unprimed, &make_pinned_resource, 0)
264270
->Unit(benchmark::kMicrosecond);
265-
// 1000MB priming
266-
BENCHMARK_CAPTURE(BM_AsyncConstructionTime, primed, &make_pinned_resource, 1000 << 20)
271+
// 1GB priming
272+
BENCHMARK_CAPTURE(BM_AsyncConstructionTime, primed_1GB, &make_pinned_resource, 1 << 30)
267273
->Unit(benchmark::kMicrosecond);
268-
269-
// Device to Host Copy Comparison benchmarks
270-
// 1MB copy, 1 copy
271-
BENCHMARK_CAPTURE(
272-
BM_DeviceToHostCopyComparison, 1MB_1copy, &make_pinned_resource, 1 << 20, 1
273-
)
274+
// 4GB priming
275+
BENCHMARK_CAPTURE(BM_AsyncConstructionTime, primed_4GB, &make_pinned_resource, 4 << 30)
274276
->Unit(benchmark::kMicrosecond);
275277

276-
// 1MB copy, 10 copies
277-
BENCHMARK_CAPTURE(
278-
BM_DeviceToHostCopyComparison, 1MB_10copies, &make_pinned_resource, 1 << 20, 10
279-
)
280-
->Unit(benchmark::kMicrosecond);
281-
282-
// 10MB copy, 1 copy
283-
BENCHMARK_CAPTURE(
284-
BM_DeviceToHostCopyComparison, 10MB_1copy, &make_pinned_resource, 10 << 20, 1
285-
)
286-
->Unit(benchmark::kMicrosecond);
287-
288-
// 10MB copy, 10 copies
289-
BENCHMARK_CAPTURE(
290-
BM_DeviceToHostCopyComparison, 10MB_10copies, &make_pinned_resource, 10 << 20, 10
291-
)
292-
->Unit(benchmark::kMicrosecond);
278+
// Device to Host Copy Comparison benchmarks - registered in a loop
279+
static auto register_device_to_host_copy_benchmarks = [] {
280+
struct BenchConfig {
281+
size_t copy_size_mb;
282+
size_t n_copies;
283+
};
284+
285+
constexpr std::array<BenchConfig, 12> configs = {{
286+
{.copy_size_mb = 1, .n_copies = 1},
287+
{.copy_size_mb = 1, .n_copies = 10},
288+
{.copy_size_mb = 1, .n_copies = 100},
289+
{.copy_size_mb = 4, .n_copies = 1},
290+
{.copy_size_mb = 4, .n_copies = 10},
291+
{.copy_size_mb = 4, .n_copies = 100},
292+
{.copy_size_mb = 10, .n_copies = 1},
293+
{.copy_size_mb = 10, .n_copies = 10},
294+
{.copy_size_mb = 10, .n_copies = 100},
295+
{.copy_size_mb = 100, .n_copies = 1},
296+
{.copy_size_mb = 100, .n_copies = 10},
297+
{.copy_size_mb = 100, .n_copies = 100},
298+
}};
299+
300+
for (const auto& config : configs) {
301+
std::string name = "BM_DeviceToHostCopyComparison/"
302+
+ std::to_string(config.copy_size_mb) + "MB_"
303+
+ std::to_string(config.n_copies) + "copies";
304+
305+
benchmark::RegisterBenchmark(
306+
name.c_str(),
307+
[](benchmark::State& state, auto factory, size_t copy_size, size_t n_copies) {
308+
BM_DeviceToHostCopyComparison(state, factory, copy_size, n_copies);
309+
},
310+
&make_pinned_resource,
311+
config.copy_size_mb << 20,
312+
config.n_copies
313+
)
314+
->Unit(benchmark::kMicrosecond);
315+
}
293316

294-
// 100MB copy, 1 copy
295-
BENCHMARK_CAPTURE(
296-
BM_DeviceToHostCopyComparison, 100MB_1copy, &make_pinned_resource, 100 << 20, 1
297-
)
298-
->Unit(benchmark::kMicrosecond);
317+
return 0;
318+
}();
299319

300320
BENCHMARK_MAIN();

0 commit comments

Comments
 (0)