Skip to content

Commit 137f454

Browse files
authored
Merge branch 'dev' into fix-docs
2 parents b5b1286 + 64a8c21 commit 137f454

File tree

15 files changed

+190
-180
lines changed

15 files changed

+190
-180
lines changed

include/cuco/detail/dynamic_map.inl

Lines changed: 48 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
* limitations under the License.
1515
*/
1616

17+
#include <cstdint>
18+
1719
namespace cuco {
1820

1921
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -77,42 +79,52 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
7779
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
7880
void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n, cudaStream_t stream)
7981
{
80-
int64_t num_elements_remaining = n;
81-
uint32_t submap_idx = 0;
82-
while (num_elements_remaining > 0) {
83-
std::size_t submap_capacity;
84-
85-
// if the submap already exists
86-
if (submap_idx < submaps_.size()) {
87-
submap_capacity = submaps_[submap_idx]->get_capacity();
82+
// Calculate current total available capacity across all submaps
83+
std::size_t total_available_capacity = 0;
84+
for (std::size_t i = 0; i < submaps_.size(); ++i) {
85+
std::size_t submap_usable_capacity =
86+
static_cast<std::size_t>(max_load_factor_ * submaps_[i]->get_capacity());
87+
// Only count capacity above the minimum insert threshold
88+
if (submap_usable_capacity >= min_insert_size_) {
89+
total_available_capacity += submap_usable_capacity - min_insert_size_;
8890
}
89-
// if the submap does not exist yet, create it
90-
else {
91-
submap_capacity = capacity_;
92-
if (erased_key_sentinel_ != empty_key_sentinel_) {
93-
submaps_.push_back(std::make_unique<cuco::legacy::static_map<Key, Value, Scope, Allocator>>(
94-
submap_capacity,
95-
empty_key<Key>{empty_key_sentinel_},
96-
empty_value<Value>{empty_value_sentinel_},
97-
erased_key<Key>{erased_key_sentinel_},
98-
alloc_,
99-
stream));
100-
} else {
101-
submaps_.push_back(std::make_unique<cuco::legacy::static_map<Key, Value, Scope, Allocator>>(
102-
submap_capacity,
103-
empty_key<Key>{empty_key_sentinel_},
104-
empty_value<Value>{empty_value_sentinel_},
105-
alloc_,
106-
stream));
107-
}
108-
submap_num_successes_.push_back(submaps_[submap_idx]->num_successes_);
109-
submap_views_.push_back(submaps_[submap_idx]->get_device_view());
110-
submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
111-
capacity_ *= 2;
91+
}
92+
93+
// Create new submaps until we have enough capacity
94+
while (total_available_capacity < n) {
95+
std::size_t new_submap_capacity = capacity_;
96+
97+
if (erased_key_sentinel_ != empty_key_sentinel_) {
98+
submaps_.push_back(std::make_unique<cuco::legacy::static_map<Key, Value, Scope, Allocator>>(
99+
new_submap_capacity,
100+
empty_key<Key>{empty_key_sentinel_},
101+
empty_value<Value>{empty_value_sentinel_},
102+
erased_key<Key>{erased_key_sentinel_},
103+
alloc_,
104+
stream));
105+
} else {
106+
submaps_.push_back(std::make_unique<cuco::legacy::static_map<Key, Value, Scope, Allocator>>(
107+
new_submap_capacity,
108+
empty_key<Key>{empty_key_sentinel_},
109+
empty_value<Value>{empty_value_sentinel_},
110+
alloc_,
111+
stream));
112112
}
113113

114-
num_elements_remaining -= max_load_factor_ * submap_capacity - min_insert_size_;
115-
submap_idx++;
114+
std::size_t submap_idx = submaps_.size() - 1;
115+
submap_num_successes_.push_back(submaps_[submap_idx]->num_successes_);
116+
submap_views_.push_back(submaps_[submap_idx]->get_device_view());
117+
submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
118+
119+
// Add the new submap's usable capacity
120+
std::size_t new_usable_capacity =
121+
static_cast<std::size_t>(max_load_factor_ * new_submap_capacity);
122+
if (new_usable_capacity >= min_insert_size_) {
123+
total_available_capacity += new_usable_capacity - min_insert_size_;
124+
}
125+
126+
// Update capacity for next submap (double the size)
127+
capacity_ *= 2;
116128
}
117129
}
118130

@@ -133,7 +145,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(
133145

134146
reserve(size_ + num_to_insert, stream);
135147

136-
uint32_t submap_idx = 0;
148+
std::size_t submap_idx = 0;
137149
while (num_to_insert > 0) {
138150
std::size_t capacity_remaining =
139151
max_load_factor_ * submaps_[submap_idx]->get_capacity() - submaps_[submap_idx]->get_size();
@@ -189,7 +201,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(
189201
auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
190202

191203
// zero out submap success counters
192-
for (uint32_t i = 0; i < submaps_.size(); ++i) {
204+
for (std::size_t i = 0; i < submaps_.size(); ++i) {
193205
CUCO_CUDA_TRY(cudaMemsetAsync(submap_num_successes_[i], 0, sizeof(atomic_ctr_type), stream));
194206
}
195207

@@ -204,7 +216,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(
204216
hash,
205217
key_equal);
206218

207-
for (uint32_t i = 0; i < submaps_.size(); ++i) {
219+
for (std::size_t i = 0; i < submaps_.size(); ++i) {
208220
std::size_t h_submap_num_successes;
209221
CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes,
210222
submap_num_successes_[i],

include/cuco/detail/dynamic_map/dynamic_map.inl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stor
8282
auto num_to_insert = cuco::detail::distance(first, last);
8383
this->reserve(size_ + num_to_insert, stream);
8484

85-
uint32_t submap_idx = 0;
85+
std::size_t submap_idx = 0;
8686
while (num_to_insert > 0) {
8787
auto& cur = submaps_[submap_idx];
8888

@@ -114,7 +114,7 @@ void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stor
114114
size_type n, cuda::stream_ref stream)
115115
{
116116
size_type num_elements_remaining = n;
117-
uint32_t submap_idx = 0;
117+
std::size_t submap_idx = 0;
118118
while (num_elements_remaining > 0) {
119119
std::size_t submap_capacity;
120120

@@ -157,9 +157,9 @@ template <typename InputIt, typename OutputIt>
157157
void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::contains(
158158
InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
159159
{
160-
auto num_keys = cuco::detail::distance(first, last);
161-
std::size_t traversed = 0;
162-
uint32_t submap_idx = 0;
160+
auto num_keys = cuco::detail::distance(first, last);
161+
std::size_t traversed = 0;
162+
std::size_t submap_idx = 0;
163163
while (num_keys > 0 && submap_idx < submaps_.size()) {
164164
const auto& cur = submaps_[submap_idx];
165165
const size_t cur_size = cur->size();

include/cuco/detail/open_addressing/kernels.cuh

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ CUCO_SUPPRESS_KERNEL_WARNINGS
5555
* @param num_successes Number of successful inserted elements
5656
* @param ref Non-owning container device ref used to access the slot storage
5757
*/
58-
template <int32_t CGSize,
59-
int32_t BlockSize,
58+
template <int CGSize,
59+
int BlockSize,
6060
typename InputIt,
6161
typename StencilIt,
6262
typename Predicate,
@@ -123,8 +123,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(InputIt first,
123123
* @param pred Predicate to test on every element in the range `[stencil, stencil + n)`
124124
* @param ref Non-owning container device ref used to access the slot storage
125125
*/
126-
template <int32_t CGSize,
127-
int32_t BlockSize,
126+
template <int CGSize,
127+
int BlockSize,
128128
typename InputIt,
129129
typename StencilIt,
130130
typename Predicate,
@@ -164,7 +164,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(
164164
* @param n Number of input elements
165165
* @param ref Non-owning container device ref used to access the slot storage
166166
*/
167-
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Ref>
167+
template <int CGSize, int BlockSize, typename InputIt, typename Ref>
168168
CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
169169
cuco::detail::index_type n,
170170
Ref ref)
@@ -204,7 +204,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
204204
* @param callback_op Function to call on every matched slot found in the container
205205
* @param ref Non-owning container device ref used to access the slot storage
206206
*/
207-
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename CallbackOp, typename Ref>
207+
template <int CGSize, int BlockSize, typename InputIt, typename CallbackOp, typename Ref>
208208
CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
209209
cuco::detail::index_type n,
210210
CallbackOp callback_op,
@@ -252,8 +252,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
252252
* @param output_begin Beginning of the sequence of booleans for the presence of each key
253253
* @param ref Non-owning container device ref used to access the slot storage
254254
*/
255-
template <int32_t CGSize,
256-
int32_t BlockSize,
255+
template <int CGSize,
256+
int BlockSize,
257257
typename InputIt,
258258
typename StencilIt,
259259
typename Predicate,
@@ -348,8 +348,8 @@ struct find_buffer<Container, cuda::std::void_t<typename Container::mapped_type>
348348
* @param output_begin Beginning of the sequence of matched payloads retrieved for each key
349349
* @param ref Non-owning container device ref used to access the slot storage
350350
*/
351-
template <int32_t CGSize,
352-
int32_t BlockSize,
351+
template <int CGSize,
352+
int BlockSize,
353353
typename InputIt,
354354
typename StencilIt,
355355
typename Predicate,
@@ -446,8 +446,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
446446
* @param inserted_begin Beginning of the sequence of booleans for the presence of each key
447447
* @param ref Non-owning container device ref used to access the slot storage
448448
*/
449-
template <int32_t CGSize,
450-
int32_t BlockSize,
449+
template <int CGSize,
450+
int BlockSize,
451451
typename InputIt,
452452
typename FoundIt,
453453
typename InsertedIt,
@@ -532,12 +532,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
532532
* @param count Number of matches
533533
* @param ref Non-owning container device ref used to access the slot storage
534534
*/
535-
template <bool IsOuter,
536-
int32_t CGSize,
537-
int32_t BlockSize,
538-
typename InputIt,
539-
typename AtomicT,
540-
typename Ref>
535+
template <bool IsOuter, int CGSize, int BlockSize, typename InputIt, typename AtomicT, typename Ref>
541536
CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
542537
cuco::detail::index_type n,
543538
AtomicT* count,
@@ -598,8 +593,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
598593
* @param ref Non-owning container device ref used to access the slot storage
599594
*/
600595
template <bool IsOuter,
601-
int32_t CGSize,
602-
int32_t BlockSize,
596+
int CGSize,
597+
int BlockSize,
603598
typename InputIt,
604599
typename OutputIt,
605600
typename Ref>
@@ -671,7 +666,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count_each(InputIt first,
671666
* @param ref Non-owning container device ref used to access the slot storage
672667
*/
673668
template <bool IsOuter,
674-
int32_t BlockSize,
669+
int BlockSize,
675670
class InputProbeIt,
676671
class OutputProbeIt,
677672
class OutputMatchIt,
@@ -724,7 +719,7 @@ CUCO_KERNEL void retrieve(InputProbeIt input_probe,
724719
* @param is_filled Predicate indicating if the given slot is filled
725720
* @param count Number of filled slots
726721
*/
727-
template <int32_t BlockSize, typename StorageRef, typename Predicate, typename AtomicT>
722+
template <int BlockSize, typename StorageRef, typename Predicate, typename AtomicT>
728723
CUCO_KERNEL __launch_bounds__(BlockSize) void size(StorageRef storage,
729724
Predicate is_filled,
730725
AtomicT* count)
@@ -749,7 +744,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void size(StorageRef storage,
749744
if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); }
750745
}
751746

752-
template <int32_t BlockSize, typename ContainerRef, typename Predicate>
747+
template <int BlockSize, typename ContainerRef, typename Predicate>
753748
CUCO_KERNEL __launch_bounds__(BlockSize) void rehash(
754749
typename ContainerRef::storage_ref_type storage_ref,
755750
ContainerRef container_ref,
@@ -796,4 +791,4 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void rehash(
796791
idx += loop_stride;
797792
}
798793
}
799-
} // namespace cuco::detail::open_addressing_ns
794+
} // namespace cuco::detail::open_addressing_ns

include/cuco/detail/open_addressing/open_addressing_impl.cuh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <thrust/iterator/transform_iterator.h>
3838

3939
#include <cmath>
40+
#include <cstdint>
4041

4142
namespace cuco {
4243
namespace detail {
@@ -834,7 +835,7 @@ class open_addressing_impl {
834835
using temp_allocator_type =
835836
typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;
836837

837-
cuco::detail::index_type constexpr stride = std::numeric_limits<int32_t>::max();
838+
cuco::detail::index_type constexpr stride = std::numeric_limits<std::int32_t>::max();
838839

839840
cuco::detail::index_type h_num_out{0};
840841
auto temp_allocator = temp_allocator_type{this->allocator()};
@@ -860,7 +861,7 @@ class open_addressing_impl {
860861
begin,
861862
output_begin + h_num_out,
862863
d_num_out,
863-
static_cast<int32_t>(num_items),
864+
static_cast<std::int32_t>(num_items),
864865
is_filled,
865866
stream.get()));
866867

@@ -872,7 +873,7 @@ class open_addressing_impl {
872873
begin,
873874
output_begin + h_num_out,
874875
d_num_out,
875-
static_cast<int32_t>(num_items),
876+
static_cast<std::int32_t>(num_items),
876877
is_filled,
877878
stream.get()));
878879

@@ -1281,7 +1282,7 @@ class open_addressing_impl {
12811282
auto counter = counter_type{this->allocator()};
12821283
counter.reset(stream.get());
12831284

1284-
int32_t constexpr block_size = cuco::detail::default_block_size();
1285+
auto constexpr block_size = cuco::detail::default_block_size();
12851286

12861287
auto constexpr grid_stride = 1;
12871288
auto const grid_size = cuco::detail::grid_size(n, cg_size, grid_stride, block_size);

0 commit comments

Comments
 (0)