Skip to content

Commit 21a5014

Browse files
committed
Use pinned host memory for improved async performance
1 parent 6d0c28e commit 21a5014

File tree

6 files changed

+63
-33
lines changed

6 files changed

+63
-33
lines changed

include/cuco/detail/dynamic_map.inl

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -157,14 +157,17 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(
157157
hash,
158158
key_equal);
159159

160-
std::size_t h_num_successes;
161-
CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes,
160+
std::size_t* h_num_successes;
161+
CUCO_CUDA_TRY(cudaMallocHost(&h_num_successes, sizeof(std::size_t)));
162+
CUCO_CUDA_TRY(cudaMemcpyAsync(h_num_successes,
162163
submap_num_successes_[submap_idx],
163164
sizeof(atomic_ctr_type),
164165
cudaMemcpyDeviceToHost,
165166
stream));
166-
submaps_[submap_idx]->size_ += h_num_successes;
167-
size_ += h_num_successes;
167+
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
168+
submaps_[submap_idx]->size_ += *h_num_successes;
169+
size_ += *h_num_successes;
170+
CUCO_CUDA_TRY(cudaFreeHost(h_num_successes));
168171
first += n;
169172
num_to_insert -= n;
170173
}
@@ -205,14 +208,17 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(
205208
key_equal);
206209

207210
for (uint32_t i = 0; i < submaps_.size(); ++i) {
208-
std::size_t h_submap_num_successes;
209-
CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes,
211+
std::size_t* h_submap_num_successes;
212+
CUCO_CUDA_TRY(cudaMallocHost(&h_submap_num_successes, sizeof(std::size_t)));
213+
CUCO_CUDA_TRY(cudaMemcpyAsync(h_submap_num_successes,
210214
submap_num_successes_[i],
211215
sizeof(atomic_ctr_type),
212216
cudaMemcpyDeviceToHost,
213217
stream));
214-
submaps_[i]->size_ -= h_submap_num_successes;
215-
size_ -= h_submap_num_successes;
218+
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
219+
submaps_[i]->size_ -= *h_submap_num_successes;
220+
size_ -= *h_submap_num_successes;
221+
CUCO_CUDA_TRY(cudaFreeHost(h_submap_num_successes));
216222
}
217223
}
218224

include/cuco/detail/hyperloglog/hyperloglog_impl.cuh

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -405,10 +405,10 @@ class hyperloglog_impl {
405405
[[nodiscard]] __host__ size_t estimate(cuda::stream_ref stream) const
406406
{
407407
auto const num_regs = 1ull << this->precision_;
408-
std::vector<register_type> host_sketch(num_regs);
408+
register_type* host_sketch;
409+
CUCO_CUDA_TRY(cudaMallocHost(&host_sketch, sizeof(register_type) * num_regs));
409410

410-
// TODO check if storage is host accessible
411-
CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(),
411+
CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch,
412412
this->sketch_.data(),
413413
sizeof(register_type) * num_regs,
414414
cudaMemcpyDefault,
@@ -419,11 +419,14 @@ class hyperloglog_impl {
419419
int zeroes = 0;
420420

421421
// geometric mean computation + count registers with 0s
422-
for (auto const reg : host_sketch) {
422+
for (size_t i = 0; i < num_regs; i++) {
423+
auto const reg = host_sketch[i];
423424
sum += fp_type{1} / static_cast<fp_type>(1ull << reg);
424425
zeroes += reg == 0;
425426
}
426427

428+
CUCO_CUDA_TRY(cudaFreeHost(host_sketch));
429+
427430
auto const finalize = cuco::hyperloglog_ns::detail::finalizer(this->precision_);
428431

429432
// pass intermediate result to finalizer for bias correction, etc.

include/cuco/detail/open_addressing/open_addressing_impl.cuh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -876,11 +876,13 @@ class open_addressing_impl {
876876
is_filled,
877877
stream.get()));
878878

879-
size_type temp_count;
879+
size_type* temp_count;
880+
CUCO_CUDA_TRY(cudaMallocHost(&temp_count, sizeof(size_type)));
880881
CUCO_CUDA_TRY(cudaMemcpyAsync(
881-
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
882+
temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
882883
stream.wait();
883-
h_num_out += temp_count;
884+
h_num_out += *temp_count;
885+
CUCO_CUDA_TRY(cudaFreeHost(temp_count));
884886
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
885887
}
886888

include/cuco/detail/static_map.inl

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -104,16 +104,19 @@ void static_map<Key, Value, Scope, Allocator>::insert(
104104
// TODO: memset an atomic variable is unsafe
105105
static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
106106
CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream));
107-
std::size_t h_num_successes;
107+
108+
std::size_t* h_num_successes;
109+
CUCO_CUDA_TRY(cudaMallocHost(&h_num_successes, sizeof(std::size_t)));
108110

109111
detail::insert<block_size, tile_size>
110112
<<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
111113
CUCO_CUDA_TRY(cudaMemcpyAsync(
112-
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
114+
h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
113115

114116
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated
115117

116-
size_ += h_num_successes;
118+
size_ += *h_num_successes;
119+
CUCO_CUDA_TRY(cudaFreeHost(h_num_successes));
117120
}
118121

119122
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -142,15 +145,18 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
142145
// TODO: memset an atomic variable is unsafe
143146
static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
144147
CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream));
145-
std::size_t h_num_successes;
148+
149+
std::size_t* h_num_successes;
150+
CUCO_CUDA_TRY(cudaMallocHost(&h_num_successes, sizeof(std::size_t)));
146151

147152
detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
148153
first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
149154
CUCO_CUDA_TRY(cudaMemcpyAsync(
150-
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
155+
h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
151156
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
152157

153-
size_ += h_num_successes;
158+
size_ += *h_num_successes;
159+
CUCO_CUDA_TRY(cudaFreeHost(h_num_successes));
154160
}
155161

156162
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -174,16 +180,19 @@ void static_map<Key, Value, Scope, Allocator>::erase(
174180
// TODO: memset an atomic variable is unsafe
175181
static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
176182
CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream));
177-
std::size_t h_num_successes;
183+
184+
std::size_t* h_num_successes;
185+
CUCO_CUDA_TRY(cudaMallocHost(&h_num_successes, sizeof(std::size_t)));
178186

179187
detail::erase<block_size, tile_size>
180188
<<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
181189
CUCO_CUDA_TRY(cudaMemcpyAsync(
182-
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
190+
h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
183191

184192
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated
185193

186-
size_ -= h_num_successes;
194+
size_ -= *h_num_successes;
195+
CUCO_CUDA_TRY(cudaFreeHost(h_num_successes));
187196
}
188197

189198
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -249,16 +258,21 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
249258
filled,
250259
stream);
251260

252-
std::size_t h_num_out;
261+
std::size_t* h_num_out;
262+
CUCO_CUDA_TRY(cudaMallocHost(&h_num_out, sizeof(std::size_t)));
253263
CUCO_CUDA_TRY(
254-
cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream));
264+
cudaMemcpyAsync(h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream));
255265
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
266+
267+
auto result = std::make_pair(keys_out + *h_num_out, values_out + *h_num_out);
268+
269+
CUCO_CUDA_TRY(cudaFreeHost(h_num_out));
256270
std::allocator_traits<temp_allocator_type>::deallocate(
257271
temp_allocator, reinterpret_cast<char*>(d_num_out), sizeof(std::size_t));
258272
std::allocator_traits<temp_allocator_type>::deallocate(
259273
temp_allocator, d_temp_storage, temp_storage_bytes);
260274

261-
return std::make_pair(keys_out + h_num_out, values_out + h_num_out);
275+
return result;
262276
}
263277

264278
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>

include/cuco/detail/storage/counter_storage.cuh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,14 @@ class counter_storage : public storage_base<cuco::extent<SizeType, 1>> {
9292
*/
9393
[[nodiscard]] constexpr size_type load_to_host(cuda::stream_ref stream) const
9494
{
95-
size_type h_count;
95+
size_type* h_count;
96+
CUCO_CUDA_TRY(cudaMallocHost(&h_count, sizeof(size_type)));
9697
CUCO_CUDA_TRY(cudaMemcpyAsync(
97-
&h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
98+
h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
9899
stream.wait();
99-
return h_count;
100+
size_type result = *h_count;
101+
CUCO_CUDA_TRY(cudaFreeHost(h_count));
102+
return result;
100103
}
101104

102105
private:

include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,15 +208,17 @@ constexpr void dynamic_bitset<Allocator>::build_ranks_and_selects(
208208
num_blocks,
209209
stream.get()));
210210

211-
size_type num_selects{};
211+
size_type* h_num_selects;
212+
CUCO_CUDA_TRY(cudaMallocHost(&h_num_selects, sizeof(size_type)));
212213
CUCO_CUDA_TRY(
213-
cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
214+
cudaMemcpyAsync(h_num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
214215
stream.wait();
215216
std::allocator_traits<temp_allocator_type>::deallocate(
216217
temp_allocator, thrust::device_ptr<char>{reinterpret_cast<char*>(d_sum)}, sizeof(size_type));
217218
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
218219

219-
selects.resize(num_selects);
220+
selects.resize(*h_num_selects);
221+
CUCO_CUDA_TRY(cudaFreeHost(h_num_selects));
220222

221223
auto const select_begin = thrust::raw_pointer_cast(selects.data());
222224

0 commit comments

Comments
 (0)