@@ -104,16 +104,19 @@ void static_map<Key, Value, Scope, Allocator>::insert(
104104 // TODO: memset an atomic variable is unsafe
105105 static_assert (sizeof (std::size_t ) == sizeof (atomic_ctr_type));
106106 CUCO_CUDA_TRY (cudaMemsetAsync (num_successes_, 0 , sizeof (atomic_ctr_type), stream));
107- std::size_t h_num_successes;
107+
108+ std::size_t * h_num_successes;
109+ CUCO_CUDA_TRY (cudaMallocHost (&h_num_successes, sizeof (std::size_t )));
108110
109111 detail::insert<block_size, tile_size>
110112 <<<grid_size, block_size, 0 , stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
111113 CUCO_CUDA_TRY (cudaMemcpyAsync (
112- & h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
114+ h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
113115
114116 CUCO_CUDA_TRY (cudaStreamSynchronize (stream)); // stream sync to ensure h_num_successes is updated
115117
116- size_ += h_num_successes;
118+ size_ += *h_num_successes;
119+ CUCO_CUDA_TRY (cudaFreeHost (h_num_successes));
117120}
118121
119122template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -142,15 +145,18 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
142145 // TODO: memset an atomic variable is unsafe
143146 static_assert (sizeof (std::size_t ) == sizeof (atomic_ctr_type));
144147 CUCO_CUDA_TRY (cudaMemsetAsync (num_successes_, 0 , sizeof (atomic_ctr_type), stream));
145- std::size_t h_num_successes;
148+
149+ std::size_t * h_num_successes;
150+ CUCO_CUDA_TRY (cudaMallocHost (&h_num_successes, sizeof (std::size_t )));
146151
147152 detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0 , stream>>>(
148153 first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
149154 CUCO_CUDA_TRY (cudaMemcpyAsync (
150- & h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
155+ h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
151156 CUCO_CUDA_TRY (cudaStreamSynchronize (stream));
152157
153- size_ += h_num_successes;
158+ size_ += *h_num_successes;
159+ CUCO_CUDA_TRY (cudaFreeHost (h_num_successes));
154160}
155161
156162template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -174,16 +180,19 @@ void static_map<Key, Value, Scope, Allocator>::erase(
174180 // TODO: memset an atomic variable is unsafe
175181 static_assert (sizeof (std::size_t ) == sizeof (atomic_ctr_type));
176182 CUCO_CUDA_TRY (cudaMemsetAsync (num_successes_, 0 , sizeof (atomic_ctr_type), stream));
177- std::size_t h_num_successes;
183+
184+ std::size_t * h_num_successes;
185+ CUCO_CUDA_TRY (cudaMallocHost (&h_num_successes, sizeof (std::size_t )));
178186
179187 detail::erase<block_size, tile_size>
180188 <<<grid_size, block_size, 0 , stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
181189 CUCO_CUDA_TRY (cudaMemcpyAsync (
182- & h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
190+ h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
183191
184192 CUCO_CUDA_TRY (cudaStreamSynchronize (stream)); // stream sync to ensure h_num_successes is updated
185193
186- size_ -= h_num_successes;
194+ size_ -= *h_num_successes;
195+ CUCO_CUDA_TRY (cudaFreeHost (h_num_successes));
187196}
188197
189198template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -249,16 +258,21 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
249258 filled,
250259 stream);
251260
252- std::size_t h_num_out;
261+ std::size_t * h_num_out;
262+ CUCO_CUDA_TRY (cudaMallocHost (&h_num_out, sizeof (std::size_t )));
253263 CUCO_CUDA_TRY (
254- cudaMemcpyAsync (& h_num_out, d_num_out, sizeof (std::size_t ), cudaMemcpyDeviceToHost, stream));
264+ cudaMemcpyAsync (h_num_out, d_num_out, sizeof (std::size_t ), cudaMemcpyDeviceToHost, stream));
255265 CUCO_CUDA_TRY (cudaStreamSynchronize (stream));
266+
267+ auto result = std::make_pair (keys_out + *h_num_out, values_out + *h_num_out);
268+
269+ CUCO_CUDA_TRY (cudaFreeHost (h_num_out));
256270 std::allocator_traits<temp_allocator_type>::deallocate (
257271 temp_allocator, reinterpret_cast <char *>(d_num_out), sizeof (std::size_t ));
258272 std::allocator_traits<temp_allocator_type>::deallocate (
259273 temp_allocator, d_temp_storage, temp_storage_bytes);
260274
261- return std::make_pair (keys_out + h_num_out, values_out + h_num_out) ;
275+ return result ;
262276}
263277
264278template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
0 commit comments