Skip to content

Commit c561327

Browse files
committed
Merge remote-tracking branch 'upstream/dev' into header-test
2 parents 548630e + 420d502 commit c561327

File tree

25 files changed

+446
-145
lines changed

25 files changed

+446
-145
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Large diffs are not rendered by default.

examples/static_set/device_subsets_example.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,8 @@ int main()
159159
auto const total_num_slots = offsets.back();
160160

161161
// Create a single bulk storage used by all subsets
162-
auto set_storage = storage_type{total_num_slots};
162+
auto set_storage = storage_type{
163+
total_num_slots, cuco::cuda_allocator<key_type>{}, cuda::stream_ref{cudaStream_t{nullptr}}};
163164
// Initializes the storage with the given sentinel
164165
set_storage.initialize(empty_key_sentinel);
165166

include/cuco/bucket_storage.cuh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,12 @@ class bucket_storage {
164164
*
165165
* @param size Number of slots to (de)allocate
166166
* @param allocator Allocator used for (de)allocating device storage
167+
* @param stream Stream to use for (de)allocating device storage
167168
*/
168-
explicit constexpr bucket_storage(Extent size, Allocator const& allocator = {});
169+
explicit constexpr bucket_storage(Extent size,
170+
Allocator const& allocator,
171+
cuda::stream_ref stream = cuda::stream_ref{
172+
cudaStream_t{nullptr}});
169173

170174
bucket_storage(bucket_storage&&) = default; ///< Move constructor
171175
/**

include/cuco/detail/bloom_filter/bloom_filter.inl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ __host__ constexpr bloom_filter<Key, Extent, Scope, Policy, Allocator>::bloom_fi
3434
Allocator const& alloc,
3535
cuda::stream_ref stream)
3636
: allocator_{alloc},
37-
data_{allocator_.allocate(num_blocks),
38-
detail::custom_deleter<std::size_t, allocator_type>{num_blocks, allocator_}},
37+
data_{allocator_.allocate(num_blocks, stream),
38+
detail::custom_deleter<std::size_t, allocator_type>{num_blocks, allocator_, stream}},
3939
ref_{data_.get(), num_blocks, {}, policy}
4040
{
4141
this->clear_async(stream);

include/cuco/detail/dynamic_map.inl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,8 @@ std::pair<KeyOut, ValueOut> dynamic_map<Key, Value, Scope, Allocator>::retrieve_
271271
size_t{0});
272272
thrust::device_vector<size_t> submap_cap_prefix_d(submap_cap_prefix);
273273

274-
auto counter = detail::counter_storage<size_t, Scope, Allocator>{this->alloc_};
274+
auto counter =
275+
detail::counter_storage<size_t, Scope, Allocator>{this->alloc_, cuda::stream_ref{stream}};
275276
counter.reset({stream});
276277

277278
detail::retrieve_all<block_size>

include/cuco/detail/hyperloglog/hyperloglog.inl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ constexpr hyperloglog<T, Scope, Hash, Allocator>::hyperloglog(cuco::sketch_size_
2222
Allocator const& alloc,
2323
cuda::stream_ref stream)
2424
: allocator_{alloc},
25-
sketch_{
26-
allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)),
27-
detail::custom_deleter{sketch_bytes(sketch_size_kb) / sizeof(register_type), allocator_}},
25+
sketch_{allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type), stream),
26+
detail::custom_deleter{
27+
sketch_bytes(sketch_size_kb) / sizeof(register_type), allocator_, stream}},
2828
ref_{cuda::std::span{reinterpret_cast<cuda::std::byte*>(sketch_.get()),
2929
sketch_bytes(sketch_size_kb)},
3030
hash}
@@ -39,9 +39,9 @@ constexpr hyperloglog<T, Scope, Hash, Allocator>::hyperloglog(
3939
Allocator const& alloc,
4040
cuda::stream_ref stream)
4141
: allocator_{alloc},
42-
sketch_{
43-
allocator_.allocate(sketch_bytes(standard_deviation) / sizeof(register_type)),
44-
detail::custom_deleter{sketch_bytes(standard_deviation) / sizeof(register_type), allocator_}},
42+
sketch_{allocator_.allocate(sketch_bytes(standard_deviation) / sizeof(register_type), stream),
43+
detail::custom_deleter{
44+
sketch_bytes(standard_deviation) / sizeof(register_type), allocator_, stream}},
4545
ref_{cuda::std::span{reinterpret_cast<cuda::std::byte*>(sketch_.get()),
4646
sketch_bytes(standard_deviation)},
4747
hash}

include/cuco/detail/open_addressing/open_addressing_impl.cuh

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ class open_addressing_impl {
136136
erased_key_sentinel_{this->extract_key(empty_slot_sentinel)},
137137
predicate_{pred},
138138
probing_scheme_{probing_scheme},
139-
storage_{make_valid_extent<probing_scheme_type, Storage>(capacity), alloc}
139+
storage_{make_valid_extent<probing_scheme_type, Storage>(capacity), alloc, stream}
140140
{
141141
this->clear_async(stream);
142142
}
@@ -182,7 +182,8 @@ class open_addressing_impl {
182182
erased_key_sentinel_{this->extract_key(empty_slot_sentinel)},
183183
predicate_{pred},
184184
probing_scheme_{probing_scheme},
185-
storage_{make_valid_extent<probing_scheme_type, Storage>(n, desired_load_factor), alloc}
185+
storage_{
186+
make_valid_extent<probing_scheme_type, Storage>(n, desired_load_factor), alloc, stream}
186187
{
187188
this->clear_async(stream);
188189
}
@@ -219,7 +220,7 @@ class open_addressing_impl {
219220
erased_key_sentinel_{erased_key_sentinel},
220221
predicate_{pred},
221222
probing_scheme_{probing_scheme},
222-
storage_{make_valid_extent<probing_scheme_type, Storage>(capacity), alloc}
223+
storage_{make_valid_extent<probing_scheme_type, Storage>(capacity), alloc, stream}
223224
{
224225
CUCO_EXPECTS(this->empty_key_sentinel() != this->erased_key_sentinel(),
225226
"The empty key sentinel and erased key sentinel cannot be the same value.",
@@ -334,7 +335,7 @@ class open_addressing_impl {
334335
if (num_keys == 0) { return 0; }
335336

336337
auto counter =
337-
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
338+
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator(), stream};
338339
counter.reset(stream);
339340

340341
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);
@@ -842,8 +843,8 @@ class open_addressing_impl {
842843

843844
cuco::detail::index_type h_num_out{0};
844845
auto temp_allocator = temp_allocator_type{this->allocator()};
845-
auto d_num_out = reinterpret_cast<size_type*>(
846-
std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, sizeof(size_type)));
846+
auto d_num_out =
847+
reinterpret_cast<size_type*>(temp_allocator.allocate(sizeof(size_type), stream));
847848

848849
// TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved
849850
for (cuco::detail::index_type offset = 0;
@@ -869,7 +870,7 @@ class open_addressing_impl {
869870
stream.get()));
870871

871872
// Allocate temporary storage
872-
auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);
873+
auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
873874

874875
CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
875876
temp_storage_bytes,
@@ -889,11 +890,10 @@ class open_addressing_impl {
889890
stream.wait();
890891
#endif
891892
h_num_out += temp_count;
892-
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
893+
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
893894
}
894895

895-
std::allocator_traits<temp_allocator_type>::deallocate(
896-
temp_allocator, reinterpret_cast<char*>(d_num_out), sizeof(size_type));
896+
temp_allocator.deallocate(reinterpret_cast<char*>(d_num_out), sizeof(size_type), stream);
897897

898898
return output_begin + h_num_out;
899899
}
@@ -969,7 +969,7 @@ class open_addressing_impl {
969969
[[nodiscard]] size_type size(cuda::stream_ref stream) const
970970
{
971971
auto counter =
972-
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
972+
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator(), stream};
973973
counter.reset(stream);
974974

975975
auto const grid_size = cuco::detail::grid_size(this->capacity());
@@ -1079,7 +1079,7 @@ class open_addressing_impl {
10791079
void rehash_async(extent_type extent, Container const& container, cuda::stream_ref stream)
10801080
{
10811081
auto const old_storage = std::move(this->storage_);
1082-
new (&storage_) storage_type{extent, this->allocator()};
1082+
new (&storage_) storage_type{extent, this->allocator(), stream};
10831083
this->clear_async(stream);
10841084

10851085
auto const num_buckets = old_storage.num_buckets();
@@ -1196,7 +1196,7 @@ class open_addressing_impl {
11961196
if (num_keys == 0) { return 0; }
11971197

11981198
auto counter =
1199-
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
1199+
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator(), stream};
12001200
counter.reset(stream);
12011201

12021202
auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);
@@ -1294,7 +1294,7 @@ class open_addressing_impl {
12941294
if (n == 0) { return {output_probe, output_match}; }
12951295

12961296
using counter_type = detail::counter_storage<size_type, thread_scope, allocator_type>;
1297-
auto counter = counter_type{this->allocator()};
1297+
auto counter = counter_type{this->allocator(), stream};
12981298
counter.reset(stream.get());
12991299

13001300
auto constexpr block_size = cuco::detail::default_block_size();

include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -266,9 +266,9 @@ class roaring_bitmap_storage<cuda::std::uint32_t, Allocator> {
266266
cuda::stream_ref stream)
267267
: allocator_{alloc},
268268
metadata_{bitmap},
269-
data_{allocator_.allocate(metadata_.size_bytes),
270-
cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes,
271-
allocator_}},
269+
data_{allocator_.allocate(metadata_.size_bytes, stream),
270+
cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>{
271+
metadata_.size_bytes, allocator_, stream}},
272272
ref_{data_.get(), metadata_}
273273
{
274274
CUCO_CUDA_TRY(cudaMemcpyAsync(
@@ -362,12 +362,12 @@ class roaring_bitmap_storage<cuda::std::uint64_t, Allocator> {
362362
[bitmap](std::vector<typename ref_type::metadata_type::bucket_metadata>& bucket_metadata) {
363363
return typename ref_type::metadata_type{bitmap, bucket_metadata};
364364
}(bucket_metadata_)},
365-
data_{allocator_.allocate(metadata_.size_bytes),
366-
cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes,
367-
allocator_}},
368-
buckets_{bucket_allocator_.allocate(metadata_.num_buckets),
365+
data_{allocator_.allocate(metadata_.size_bytes, stream),
366+
cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>{
367+
metadata_.size_bytes, allocator_, stream}},
368+
buckets_{bucket_allocator_.allocate(metadata_.num_buckets, stream),
369369
cuco::detail::custom_deleter<cuda::std::size_t, bucket_allocator_type>{
370-
metadata_.num_buckets, bucket_allocator_}},
370+
metadata_.num_buckets, bucket_allocator_, stream}},
371371
ref_{data_.get(), metadata_, buckets_.get()}
372372
{
373373
assert(metadata_.valid);

include/cuco/detail/static_map.inl

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ static_map<Key, Value, Scope, Allocator>::static_map(std::size_t capacity,
4040
slot_allocator_{alloc},
4141
counter_allocator_{alloc}
4242
{
43-
slots_ = std::allocator_traits<slot_allocator_type>::allocate(slot_allocator_, capacity_);
44-
num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
43+
slots_ = slot_allocator_.allocate(capacity_, cuda::stream_ref{stream});
44+
num_successes_ = counter_allocator_.allocate(1, cuda::stream_ref{stream});
4545

4646
auto constexpr block_size = 256;
4747
auto constexpr stride = 4;
@@ -69,8 +69,8 @@ static_map<Key, Value, Scope, Allocator>::static_map(std::size_t capacity,
6969
"The empty key sentinel and erased key sentinel cannot be the same value.",
7070
std::runtime_error);
7171

72-
slots_ = std::allocator_traits<slot_allocator_type>::allocate(slot_allocator_, capacity_);
73-
num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
72+
slots_ = slot_allocator_.allocate(capacity_, cuda::stream_ref{stream});
73+
num_successes_ = counter_allocator_.allocate(1, cuda::stream_ref{stream});
7474

7575
auto constexpr block_size = 256;
7676
auto constexpr stride = 4;
@@ -83,8 +83,8 @@ static_map<Key, Value, Scope, Allocator>::static_map(std::size_t capacity,
8383
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
8484
static_map<Key, Value, Scope, Allocator>::~static_map()
8585
{
86-
std::allocator_traits<slot_allocator_type>::deallocate(slot_allocator_, slots_, capacity_);
87-
std::allocator_traits<counter_allocator_type>::deallocate(counter_allocator_, num_successes_, 1);
86+
slot_allocator_.deallocate(slots_, capacity_, cuda::stream_ref{cudaStream_t{nullptr}});
87+
counter_allocator_.deallocate(num_successes_, 1, cuda::stream_ref{cudaStream_t{nullptr}});
8888
}
8989

9090
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -226,7 +226,7 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
226226
typename std::allocator_traits<Allocator>::template rebind_alloc<char>;
227227
auto temp_allocator = temp_allocator_type{slot_allocator_};
228228
auto d_num_out = reinterpret_cast<std::size_t*>(
229-
std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, sizeof(std::size_t)));
229+
temp_allocator.allocate(sizeof(std::size_t), cuda::stream_ref{stream}));
230230
cub::DeviceSelect::If(nullptr,
231231
temp_storage_bytes,
232232
begin,
@@ -237,8 +237,7 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
237237
stream);
238238

239239
// Allocate temporary storage
240-
auto d_temp_storage =
241-
std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, temp_storage_bytes);
240+
auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, cuda::stream_ref{stream});
242241

243242
cub::DeviceSelect::If(d_temp_storage,
244243
temp_storage_bytes,
@@ -253,10 +252,9 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
253252
CUCO_CUDA_TRY(
254253
cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream));
255254
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
256-
std::allocator_traits<temp_allocator_type>::deallocate(
257-
temp_allocator, reinterpret_cast<char*>(d_num_out), sizeof(std::size_t));
258-
std::allocator_traits<temp_allocator_type>::deallocate(
259-
temp_allocator, d_temp_storage, temp_storage_bytes);
255+
temp_allocator.deallocate(
256+
reinterpret_cast<char*>(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream});
257+
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, cuda::stream_ref{stream});
260258

261259
return std::make_pair(keys_out + h_num_out, values_out + h_num_out);
262260
}

include/cuco/detail/static_multimap/static_multimap.inl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -787,8 +787,8 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::static_multimap(
787787
empty_key_sentinel_{empty_key_sentinel.value},
788788
empty_value_sentinel_{empty_value_sentinel.value},
789789
allocator_{alloc},
790-
delete_slots_{allocator_, capacity_},
791-
slots_{allocator_.allocate(capacity_), delete_slots_}
790+
delete_slots_{allocator_, capacity_, cuda::stream_ref{stream}},
791+
slots_{allocator_.allocate(capacity_, cuda::stream_ref{stream}), delete_slots_}
792792
{
793793
auto constexpr block_size = 128;
794794
auto constexpr stride = 4;
@@ -909,7 +909,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count(
909909
auto view = get_device_view();
910910
auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
911911

912-
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
912+
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
913913
counter.reset(stream);
914914

915915
detail::count<block_size, cg_size(), is_outer>
@@ -937,7 +937,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count_
937937
auto view = get_device_view();
938938
auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
939939

940-
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
940+
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
941941
counter.reset(stream);
942942

943943
detail::count<block_size, cg_size(), is_outer>
@@ -965,7 +965,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
965965
auto view = get_device_view();
966966
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
967967

968-
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
968+
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
969969
counter.reset(stream);
970970

971971
detail::pair_count<block_size, cg_size(), is_outer>
@@ -993,7 +993,7 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
993993
auto view = get_device_view();
994994
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
995995

996-
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
996+
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
997997
counter.reset(stream);
998998

999999
detail::pair_count<block_size, cg_size(), is_outer>
@@ -1026,7 +1026,7 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve(
10261026

10271027
auto const grid_size = detail::grid_size(num_keys, cg_size());
10281028

1029-
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
1029+
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
10301030
counter.reset(stream);
10311031

10321032
detail::retrieve<detail::default_block_size(), flushing_cg_size, cg_size(), buffer_size, is_outer>
@@ -1060,7 +1060,7 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve_
10601060

10611061
auto const grid_size = detail::grid_size(num_keys, cg_size());
10621062

1063-
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
1063+
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
10641064
counter.reset(stream);
10651065

10661066
detail::retrieve<detail::default_block_size(), flushing_cg_size, cg_size(), buffer_size, is_outer>
@@ -1101,7 +1101,7 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve(
11011101
}();
11021102
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
11031103

1104-
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
1104+
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
11051105
counter.reset(stream);
11061106

11071107
detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
@@ -1148,7 +1148,7 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve_oute
11481148
}();
11491149
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
11501150

1151-
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
1151+
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
11521152
counter.reset(stream);
11531153

11541154
detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>

0 commit comments

Comments
 (0)