Skip to content

Commit a5888e0

Browse files
authored
Revert "Use pinned host memory for improved async performance" (#739)
Reverts #727 as it introduces performance regressions in several places.
1 parent 9115ffa commit a5888e0

File tree

6 files changed

+33
-63
lines changed

6 files changed

+33
-63
lines changed

include/cuco/detail/dynamic_map.inl

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -157,17 +157,14 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(
157157
hash,
158158
key_equal);
159159

160-
std::size_t* h_num_successes;
161-
CUCO_CUDA_TRY(cudaMallocHost(&h_num_successes, sizeof(std::size_t)));
162-
CUCO_CUDA_TRY(cudaMemcpyAsync(h_num_successes,
160+
std::size_t h_num_successes;
161+
CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes,
163162
submap_num_successes_[submap_idx],
164163
sizeof(atomic_ctr_type),
165164
cudaMemcpyDeviceToHost,
166165
stream));
167-
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
168-
submaps_[submap_idx]->size_ += *h_num_successes;
169-
size_ += *h_num_successes;
170-
CUCO_CUDA_TRY(cudaFreeHost(h_num_successes));
166+
submaps_[submap_idx]->size_ += h_num_successes;
167+
size_ += h_num_successes;
171168
first += n;
172169
num_to_insert -= n;
173170
}
@@ -208,17 +205,14 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(
208205
key_equal);
209206

210207
for (uint32_t i = 0; i < submaps_.size(); ++i) {
211-
std::size_t* h_submap_num_successes;
212-
CUCO_CUDA_TRY(cudaMallocHost(&h_submap_num_successes, sizeof(std::size_t)));
213-
CUCO_CUDA_TRY(cudaMemcpyAsync(h_submap_num_successes,
208+
std::size_t h_submap_num_successes;
209+
CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes,
214210
submap_num_successes_[i],
215211
sizeof(atomic_ctr_type),
216212
cudaMemcpyDeviceToHost,
217213
stream));
218-
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
219-
submaps_[i]->size_ -= *h_submap_num_successes;
220-
size_ -= *h_submap_num_successes;
221-
CUCO_CUDA_TRY(cudaFreeHost(h_submap_num_successes));
214+
submaps_[i]->size_ -= h_submap_num_successes;
215+
size_ -= h_submap_num_successes;
222216
}
223217
}
224218

include/cuco/detail/hyperloglog/hyperloglog_impl.cuh

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -405,10 +405,10 @@ class hyperloglog_impl {
405405
[[nodiscard]] __host__ size_t estimate(cuda::stream_ref stream) const
406406
{
407407
auto const num_regs = 1ull << this->precision_;
408-
register_type* host_sketch;
409-
CUCO_CUDA_TRY(cudaMallocHost(&host_sketch, sizeof(register_type) * num_regs));
408+
std::vector<register_type> host_sketch(num_regs);
410409

411-
CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch,
410+
// TODO check if storage is host accessible
411+
CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(),
412412
this->sketch_.data(),
413413
sizeof(register_type) * num_regs,
414414
cudaMemcpyDefault,
@@ -419,14 +419,11 @@ class hyperloglog_impl {
419419
int zeroes = 0;
420420

421421
// geometric mean computation + count registers with 0s
422-
for (size_t i = 0; i < num_regs; i++) {
423-
auto const reg = host_sketch[i];
422+
for (auto const reg : host_sketch) {
424423
sum += fp_type{1} / static_cast<fp_type>(1ull << reg);
425424
zeroes += reg == 0;
426425
}
427426

428-
CUCO_CUDA_TRY(cudaFreeHost(host_sketch));
429-
430427
auto const finalize = cuco::hyperloglog_ns::detail::finalizer(this->precision_);
431428

432429
// pass intermediate result to finalizer for bias correction, etc.

include/cuco/detail/open_addressing/open_addressing_impl.cuh

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -876,13 +876,11 @@ class open_addressing_impl {
876876
is_filled,
877877
stream.get()));
878878

879-
size_type* temp_count;
880-
CUCO_CUDA_TRY(cudaMallocHost(&temp_count, sizeof(size_type)));
879+
size_type temp_count;
881880
CUCO_CUDA_TRY(cudaMemcpyAsync(
882-
temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
881+
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
883882
stream.wait();
884-
h_num_out += *temp_count;
885-
CUCO_CUDA_TRY(cudaFreeHost(temp_count));
883+
h_num_out += temp_count;
886884
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
887885
}
888886

include/cuco/detail/static_map.inl

Lines changed: 12 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -104,19 +104,16 @@ void static_map<Key, Value, Scope, Allocator>::insert(
104104
// TODO: memset an atomic variable is unsafe
105105
static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
106106
CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream));
107-
108-
std::size_t* h_num_successes;
109-
CUCO_CUDA_TRY(cudaMallocHost(&h_num_successes, sizeof(std::size_t)));
107+
std::size_t h_num_successes;
110108

111109
detail::insert<block_size, tile_size>
112110
<<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
113111
CUCO_CUDA_TRY(cudaMemcpyAsync(
114-
h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
112+
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
115113

116114
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated
117115

118-
size_ += *h_num_successes;
119-
CUCO_CUDA_TRY(cudaFreeHost(h_num_successes));
116+
size_ += h_num_successes;
120117
}
121118

122119
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -145,18 +142,15 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
145142
// TODO: memset an atomic variable is unsafe
146143
static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
147144
CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream));
148-
149-
std::size_t* h_num_successes;
150-
CUCO_CUDA_TRY(cudaMallocHost(&h_num_successes, sizeof(std::size_t)));
145+
std::size_t h_num_successes;
151146

152147
detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
153148
first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
154149
CUCO_CUDA_TRY(cudaMemcpyAsync(
155-
h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
150+
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
156151
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
157152

158-
size_ += *h_num_successes;
159-
CUCO_CUDA_TRY(cudaFreeHost(h_num_successes));
153+
size_ += h_num_successes;
160154
}
161155

162156
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -180,19 +174,16 @@ void static_map<Key, Value, Scope, Allocator>::erase(
180174
// TODO: memset an atomic variable is unsafe
181175
static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
182176
CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream));
183-
184-
std::size_t* h_num_successes;
185-
CUCO_CUDA_TRY(cudaMallocHost(&h_num_successes, sizeof(std::size_t)));
177+
std::size_t h_num_successes;
186178

187179
detail::erase<block_size, tile_size>
188180
<<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
189181
CUCO_CUDA_TRY(cudaMemcpyAsync(
190-
h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
182+
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
191183

192184
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated
193185

194-
size_ -= *h_num_successes;
195-
CUCO_CUDA_TRY(cudaFreeHost(h_num_successes));
186+
size_ -= h_num_successes;
196187
}
197188

198189
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -258,21 +249,16 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
258249
filled,
259250
stream);
260251

261-
std::size_t* h_num_out;
262-
CUCO_CUDA_TRY(cudaMallocHost(&h_num_out, sizeof(std::size_t)));
252+
std::size_t h_num_out;
263253
CUCO_CUDA_TRY(
264-
cudaMemcpyAsync(h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream));
254+
cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream));
265255
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
266-
267-
auto result = std::make_pair(keys_out + *h_num_out, values_out + *h_num_out);
268-
269-
CUCO_CUDA_TRY(cudaFreeHost(h_num_out));
270256
std::allocator_traits<temp_allocator_type>::deallocate(
271257
temp_allocator, reinterpret_cast<char*>(d_num_out), sizeof(std::size_t));
272258
std::allocator_traits<temp_allocator_type>::deallocate(
273259
temp_allocator, d_temp_storage, temp_storage_bytes);
274260

275-
return result;
261+
return std::make_pair(keys_out + h_num_out, values_out + h_num_out);
276262
}
277263

278264
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>

include/cuco/detail/storage/counter_storage.cuh

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -92,14 +92,11 @@ class counter_storage : public storage_base<cuco::extent<SizeType, 1>> {
9292
*/
9393
[[nodiscard]] constexpr size_type load_to_host(cuda::stream_ref stream) const
9494
{
95-
size_type* h_count;
96-
CUCO_CUDA_TRY(cudaMallocHost(&h_count, sizeof(size_type)));
95+
size_type h_count;
9796
CUCO_CUDA_TRY(cudaMemcpyAsync(
98-
h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
97+
&h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
9998
stream.wait();
100-
size_type result = *h_count;
101-
CUCO_CUDA_TRY(cudaFreeHost(h_count));
102-
return result;
99+
return h_count;
103100
}
104101

105102
private:

include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -209,17 +209,15 @@ constexpr void dynamic_bitset<Allocator>::build_ranks_and_selects(
209209
num_blocks,
210210
stream.get()));
211211

212-
size_type* h_num_selects;
213-
CUCO_CUDA_TRY(cudaMallocHost(&h_num_selects, sizeof(size_type)));
212+
size_type num_selects{};
214213
CUCO_CUDA_TRY(
215-
cudaMemcpyAsync(h_num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
214+
cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
216215
stream.wait();
217216
std::allocator_traits<temp_allocator_type>::deallocate(
218217
temp_allocator, thrust::device_ptr<char>{reinterpret_cast<char*>(d_sum)}, sizeof(size_type));
219218
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
220219

221-
selects.resize(*h_num_selects);
222-
CUCO_CUDA_TRY(cudaFreeHost(h_num_selects));
220+
selects.resize(num_selects);
223221

224222
auto const select_begin = thrust::raw_pointer_cast(selects.data());
225223

0 commit comments

Comments (0)