Skip to content

Commit 717e523

Browse files
committed
adding more benchmarks
Signed-off-by: niranda perera <[email protected]>
1 parent 30263cd commit 717e523

File tree

3 files changed

+138
-9
lines changed

3 files changed

+138
-9
lines changed

cpp/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ set_target_properties(
137137
)
138138
target_compile_options(
139139
bench_pinned_memory_resources PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAPIDSMPF_CXX_FLAGS}>"
140-
"$<$<COMPILE_LANGUAGE:CUDA>:${RAPIDSMPF_CUDA_FLAGS}>"
140+
"$<$<COMPILE_LANGUAGE:CUDA>:${RAPIDSMPF_CUDA_FLAGS}>"
141141
)
142142
target_link_libraries(
143143
bench_pinned_memory_resources

cpp/benchmarks/bench_pinned_memory_resources.cpp

Lines changed: 136 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,18 @@
2121
#include <vector>
2222

2323
#include <benchmark/benchmark.h>
24+
#include <cuda_runtime.h>
2425

2526
#include <rmm/cuda_stream.hpp>
2627

2728
#include <rapidsmpf/buffer/pinned_memory_resource.hpp>
2829

2930
namespace {
3031

32+
// Inspired by the RMM async priming benchmark; see
33+
// https://github.com/rapidsai/rmm/blob/branch-25.12/cpp/benchmarks/async_priming/async_priming_bench.cpp
34+
// for the original.
35+
3136
/**
3237
* @brief Factory function to create a cuda_async_memory_resource with priming
3338
*/
@@ -133,7 +138,9 @@ void BM_AsyncPrimingImpact(
133138
* @brief Benchmark to measure construction time with and without priming
134139
*/
135140
template <typename MRFactoryFunc>
136-
void BM_AsyncConstructionTime(benchmark::State& state, MRFactoryFunc factory, size_t initial_pool_size) {
141+
void BM_AsyncConstructionTime(
142+
benchmark::State& state, MRFactoryFunc factory, size_t initial_pool_size
143+
) {
137144
for (auto _ : state) {
138145
auto start_time = std::chrono::high_resolution_clock::now();
139146
auto [pool, mr] = factory(initial_pool_size);
@@ -149,23 +156,145 @@ void BM_AsyncConstructionTime(benchmark::State& state, MRFactoryFunc factory, si
149156
}
150157
}
151158

159+
/**
160+
* @brief Benchmark comparing device-to-host copies into regular (std::vector) host buffers against copies into stream-ordered pinned host buffers
161+
*/
162+
template <typename MRFactoryFunc>
163+
void BM_DeviceToHostCopyComparison(
164+
benchmark::State& state, MRFactoryFunc factory, size_t copy_size, size_t n_copies
165+
) {
166+
// Create memory resource for stream-ordered pinned memory
167+
auto [pool, mr] = factory(0);
168+
169+
rmm::cuda_stream stream{rmm::cuda_stream::flags::non_blocking};
170+
171+
// Allocate device memory
172+
rmm::device_buffer device_buf(copy_size, stream);
173+
stream.synchronize();
174+
175+
// Preallocate regular host memory buffers
176+
std::vector<std::vector<std::byte>> host_bufs;
177+
host_bufs.reserve(n_copies);
178+
for (size_t i = 0; i < n_copies; ++i) {
179+
host_bufs.emplace_back(copy_size);
180+
}
181+
182+
// Preallocate stream-ordered pinned memory buffers
183+
std::vector<rapidsmpf::PinnedHostBuffer> pinned_bufs;
184+
pinned_bufs.reserve(n_copies);
185+
for (size_t i = 0; i < n_copies; ++i) {
186+
pinned_bufs.emplace_back(copy_size, stream, mr);
187+
}
188+
stream.synchronize();
189+
190+
for (auto _ : state) {
191+
state.PauseTiming();
192+
193+
// Wait for any outstanding work on the stream so it does not leak into the timed region
194+
stream.synchronize();
195+
196+
state.ResumeTiming();
197+
198+
// Time device to regular host copies (synchronous)
199+
auto host_copy_start = std::chrono::high_resolution_clock::now();
200+
for (size_t i = 0; i < n_copies; ++i) {
201+
cudaMemcpy(
202+
host_bufs[i].data(), device_buf.data(), copy_size, cudaMemcpyDefault
203+
);
204+
}
205+
auto host_copy_end = std::chrono::high_resolution_clock::now();
206+
207+
// Time device to stream-ordered pinned copies (asynchronous; the timed region includes the final stream synchronization)
208+
auto pinned_copy_start = std::chrono::high_resolution_clock::now();
209+
for (size_t i = 0; i < n_copies; ++i) {
210+
cudaMemcpyAsync(
211+
pinned_bufs[i].data(),
212+
device_buf.data(),
213+
copy_size,
214+
cudaMemcpyDefault,
215+
stream.value()
216+
);
217+
}
218+
stream.synchronize();
219+
auto pinned_copy_end = std::chrono::high_resolution_clock::now();
220+
221+
// Calculate times in nanoseconds
222+
auto host_copy_time_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
223+
host_copy_end - host_copy_start
224+
)
225+
.count();
226+
auto pinned_copy_time_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
227+
pinned_copy_end - pinned_copy_start
228+
)
229+
.count();
230+
231+
// Calculate bandwidth in GB/s (1 GB = 1e9 bytes) from the total bytes transferred
232+
auto total_bytes = static_cast<double>(copy_size * n_copies);
233+
auto host_bandwidth_gbps =
234+
(total_bytes / 1e9) / (host_copy_time_ns / 1e9); // GB/s
235+
auto pinned_bandwidth_gbps =
236+
(total_bytes / 1e9) / (pinned_copy_time_ns / 1e9); // GB/s
237+
238+
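// Set benchmark counters. NOTE(review): these are reassigned on every iteration, so only the last iteration's values appear to be kept - confirm this is intended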
239+
state.counters["host_copy_time_ns"] = host_copy_time_ns;
240+
state.counters["pinned_copy_time_ns"] = pinned_copy_time_ns;
241+
state.counters["host_bandwidth_gbps"] = host_bandwidth_gbps;
242+
state.counters["pinned_bandwidth_gbps"] = pinned_bandwidth_gbps;
243+
state.counters["speedup"] = static_cast<double>(host_copy_time_ns)
244+
/ static_cast<double>(pinned_copy_time_ns);
245+
}
246+
}
247+
152248
} // namespace
153249

154250
// Register benchmarks
155-
BENCHMARK_CAPTURE(
156-
BM_AsyncPrimingImpact, unprimed, &make_pinned_resource, 0, 10 << 20
157-
) // 1MB allocations
251+
252+
// 10MB allocations with no priming
253+
BENCHMARK_CAPTURE(BM_AsyncPrimingImpact, unprimed, &make_pinned_resource, 0, 10 << 20)
158254
->Unit(benchmark::kMicrosecond);
159255

256+
// 10MB allocations with 1000MB priming
160257
BENCHMARK_CAPTURE(
161258
BM_AsyncPrimingImpact, primed, &make_pinned_resource, 1000 << 20, 10 << 20
162-
) // 1MB allocations with 100MB priming
259+
)
163260
->Unit(benchmark::kMicrosecond);
164261

262+
// Construction time with no priming
165263
BENCHMARK_CAPTURE(BM_AsyncConstructionTime, unprimed, &make_pinned_resource, 0)
166264
->Unit(benchmark::kMicrosecond);
167-
265+
// Construction time with 1000MB priming
168266
BENCHMARK_CAPTURE(BM_AsyncConstructionTime, primed, &make_pinned_resource, 1000 << 20)
169267
->Unit(benchmark::kMicrosecond);
170268

171-
BENCHMARK_MAIN();
269+
// Device to Host Copy Comparison benchmarks
270+
// 1MB copy, 1 copy
271+
BENCHMARK_CAPTURE(
272+
BM_DeviceToHostCopyComparison, 1MB_1copy, &make_pinned_resource, 1 << 20, 1
273+
)
274+
->Unit(benchmark::kMicrosecond);
275+
276+
// 1MB copy, 10 copies
277+
BENCHMARK_CAPTURE(
278+
BM_DeviceToHostCopyComparison, 1MB_10copies, &make_pinned_resource, 1 << 20, 10
279+
)
280+
->Unit(benchmark::kMicrosecond);
281+
282+
// 10MB copy, 1 copy
283+
BENCHMARK_CAPTURE(
284+
BM_DeviceToHostCopyComparison, 10MB_1copy, &make_pinned_resource, 10 << 20, 1
285+
)
286+
->Unit(benchmark::kMicrosecond);
287+
288+
// 10MB copy, 10 copies
289+
BENCHMARK_CAPTURE(
290+
BM_DeviceToHostCopyComparison, 10MB_10copies, &make_pinned_resource, 10 << 20, 10
291+
)
292+
->Unit(benchmark::kMicrosecond);
293+
294+
// 100MB copy, 1 copy
295+
BENCHMARK_CAPTURE(
296+
BM_DeviceToHostCopyComparison, 100MB_1copy, &make_pinned_resource, 100 << 20, 1
297+
)
298+
->Unit(benchmark::kMicrosecond);
299+
300+
BENCHMARK_MAIN();

cpp/include/rapidsmpf/buffer/pinned_memory_resource.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class PinnedMemoryResource; // forward declaration
6363
* @sa https://github.com/rapidsai/rmm/issues/1931
6464
*/
6565
struct PinnedPoolProperties {
66-
size_t initial_pool_size = 0;
66+
size_t initial_pool_size = 0; ///< The initial size of the pool in bytes.
6767
};
6868

6969
/**

0 commit comments

Comments
 (0)