@@ -104,19 +104,16 @@ void static_map<Key, Value, Scope, Allocator>::insert(
104104 // TODO: memset an atomic variable is unsafe
105105 static_assert (sizeof (std::size_t ) == sizeof (atomic_ctr_type));
106106 CUCO_CUDA_TRY (cudaMemsetAsync (num_successes_, 0 , sizeof (atomic_ctr_type), stream));
107-
108- std::size_t * h_num_successes;
109- CUCO_CUDA_TRY (cudaMallocHost (&h_num_successes, sizeof (std::size_t )));
107+ std::size_t h_num_successes;
110108
111109 detail::insert<block_size, tile_size>
112110 <<<grid_size, block_size, 0 , stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
113111 CUCO_CUDA_TRY (cudaMemcpyAsync (
114- h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
112+ & h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
115113
116114 CUCO_CUDA_TRY (cudaStreamSynchronize (stream)); // stream sync to ensure h_num_successes is updated
117115
118- size_ += *h_num_successes;
119- CUCO_CUDA_TRY (cudaFreeHost (h_num_successes));
116+ size_ += h_num_successes;
120117}
121118
122119template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -145,18 +142,15 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
145142 // TODO: memset an atomic variable is unsafe
146143 static_assert (sizeof (std::size_t ) == sizeof (atomic_ctr_type));
147144 CUCO_CUDA_TRY (cudaMemsetAsync (num_successes_, 0 , sizeof (atomic_ctr_type), stream));
148-
149- std::size_t * h_num_successes;
150- CUCO_CUDA_TRY (cudaMallocHost (&h_num_successes, sizeof (std::size_t )));
145+ std::size_t h_num_successes;
151146
152147 detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0 , stream>>>(
153148 first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
154149 CUCO_CUDA_TRY (cudaMemcpyAsync (
155- h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
150+ & h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
156151 CUCO_CUDA_TRY (cudaStreamSynchronize (stream));
157152
158- size_ += *h_num_successes;
159- CUCO_CUDA_TRY (cudaFreeHost (h_num_successes));
153+ size_ += h_num_successes;
160154}
161155
162156template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -180,19 +174,16 @@ void static_map<Key, Value, Scope, Allocator>::erase(
180174 // TODO: memset an atomic variable is unsafe
181175 static_assert (sizeof (std::size_t ) == sizeof (atomic_ctr_type));
182176 CUCO_CUDA_TRY (cudaMemsetAsync (num_successes_, 0 , sizeof (atomic_ctr_type), stream));
183-
184- std::size_t * h_num_successes;
185- CUCO_CUDA_TRY (cudaMallocHost (&h_num_successes, sizeof (std::size_t )));
177+ std::size_t h_num_successes;
186178
187179 detail::erase<block_size, tile_size>
188180 <<<grid_size, block_size, 0 , stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
189181 CUCO_CUDA_TRY (cudaMemcpyAsync (
190- h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
182+ & h_num_successes, num_successes_, sizeof (atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
191183
192184 CUCO_CUDA_TRY (cudaStreamSynchronize (stream)); // stream sync to ensure h_num_successes is updated
193185
194- size_ -= *h_num_successes;
195- CUCO_CUDA_TRY (cudaFreeHost (h_num_successes));
186+ size_ -= h_num_successes;
196187}
197188
198189template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -258,21 +249,16 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
258249 filled,
259250 stream);
260251
261- std::size_t * h_num_out;
262- CUCO_CUDA_TRY (cudaMallocHost (&h_num_out, sizeof (std::size_t )));
252+ std::size_t h_num_out;
263253 CUCO_CUDA_TRY (
264- cudaMemcpyAsync (h_num_out, d_num_out, sizeof (std::size_t ), cudaMemcpyDeviceToHost, stream));
254+ cudaMemcpyAsync (& h_num_out, d_num_out, sizeof (std::size_t ), cudaMemcpyDeviceToHost, stream));
265255 CUCO_CUDA_TRY (cudaStreamSynchronize (stream));
266-
267- auto result = std::make_pair (keys_out + *h_num_out, values_out + *h_num_out);
268-
269- CUCO_CUDA_TRY (cudaFreeHost (h_num_out));
270256 std::allocator_traits<temp_allocator_type>::deallocate (
271257 temp_allocator, reinterpret_cast <char *>(d_num_out), sizeof (std::size_t ));
272258 std::allocator_traits<temp_allocator_type>::deallocate (
273259 temp_allocator, d_temp_storage, temp_storage_bytes);
274260
275- return result ;
261+ return std::make_pair (keys_out + h_num_out, values_out + h_num_out) ;
276262}
277263
278264template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
0 commit comments