Skip to content

Commit 6d0c28e

Browse files
PointKernelsleeepyjackpre-commit-ci[bot]
authored
Refactor bucket storage to 1D layout for improved performance (#694)
This PR refactors the bucket storage implementation, switching from the previous 2D layout to a 1D layout. Benchmark results show this change reduces branching instructions by 30%, leading to roughly a 10% improvement in runtime performance. --------- Co-authored-by: Daniel Jünger <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent f6e670e commit 6d0c28e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+854
-817
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Large diffs are not rendered by default.

benchmarks/static_multimap/count_bench.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_count(
5050

5151
thrust::device_vector<pair_type> pairs(num_keys);
5252
thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) {
53-
return pair_type(key, {});
53+
return pair_type{key, {}};
5454
});
5555

5656
gen.dropout(keys.begin(), keys.end(), matching_rate);

examples/static_multiset/host_bulk_example.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,4 +79,4 @@ int main(void)
7979
}
8080

8181
return 0;
82-
}
82+
}

examples/static_set/device_subsets_example.cu

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
*/
1616

1717
#include <cuco/static_set_ref.cuh>
18-
#include <cuco/storage.cuh>
1918

2019
#include <cuda/std/array>
2120
#include <cuda/std/functional>
@@ -143,24 +142,24 @@ int main()
143142
valid_sizes.reserve(num);
144143

145144
for (size_t i = 0; i < num; ++i) {
146-
valid_sizes.emplace_back(
147-
static_cast<std::size_t>(cuco::make_bucket_extent<ref_type>(subset_sizes[i])));
145+
valid_sizes.emplace_back(static_cast<std::size_t>(
146+
cuco::make_valid_extent<probing_scheme_type, storage_ref_type>(subset_sizes[i])));
148147
}
149148

150149
std::vector<std::size_t> offsets(num + 1, 0);
151150

152-
// prefix sum to compute offsets and total number of buckets
151+
// prefix sum to compute offsets and total number of slots
153152
std::size_t current_sum = 0;
154153
for (std::size_t i = 0; i < valid_sizes.size(); ++i) {
155154
current_sum += valid_sizes[i];
156155
offsets[i + 1] = current_sum;
157156
}
158157

159-
// total number of buckets is located at the back of the offsets array
160-
auto const total_num_buckets = offsets.back();
158+
// total number of slots is located at the back of the offsets array
159+
auto const total_num_slots = offsets.back();
161160

162161
// Create a single bulk storage used by all subsets
163-
auto set_storage = storage_type{total_num_buckets};
162+
auto set_storage = storage_type{total_num_slots};
164163
// Initializes the storage with the given sentinel
165164
set_storage.initialize(empty_key_sentinel);
166165

examples/static_set/shared_memory_example.cu

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -26,16 +26,16 @@
2626
*/
2727

2828
template <class SetRef>
29-
__global__ void shmem_set_kernel(typename SetRef::extent_type bucket_extent,
29+
__global__ void shmem_set_kernel(typename SetRef::extent_type valid_extent,
3030
cuco::empty_key<typename SetRef::key_type> empty_key_sentinel)
3131
{
3232
// We first allocate the shared memory storage for the `set`.
3333
// The storage is comprised of contiguous buckets of slots,
3434
// which allow for vectorized loads.
35-
__shared__ typename SetRef::bucket_type buckets[bucket_extent.value()];
35+
__shared__ typename SetRef::value_type slots[valid_extent.value()];
3636

3737
// Next, we construct the actual storage object from the raw array.
38-
auto storage = SetRef::storage_ref_type(bucket_extent, buckets);
38+
auto storage = SetRef::storage_ref_type(valid_extent, slots);
3939
// Now we can instantiate the set from the storage.
4040
auto set = SetRef(empty_key_sentinel, {}, {}, {}, storage);
4141

@@ -103,9 +103,10 @@ int main(void)
103103
// Cuco imposes a number of non-trivial contraints on the capacity value.
104104
// This function will take the requested capacity (1000) and return the next larger
105105
// valid extent.
106-
auto constexpr bucket_extent = cuco::make_bucket_extent<set_ref_type>(extent_type{});
106+
auto constexpr valid_extent =
107+
cuco::make_valid_extent<probing_scheme_type, cuco::storage<bucket_size>>(extent_type{});
107108

108109
// Launch the kernel with a single thread block.
109-
shmem_set_kernel<set_ref_type><<<1, 128>>>(bucket_extent, empty_key_sentinel);
110+
shmem_set_kernel<set_ref_type><<<1, 128>>>(valid_extent, empty_key_sentinel);
110111
cudaDeviceSynchronize();
111112
}

include/cuco/bucket_storage.cuh

Lines changed: 94 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616

1717
#pragma once
1818

19-
#include <cuco/detail/storage/bucket_storage_base.cuh>
19+
#include <cuco/detail/storage/storage_base.cuh>
2020
#include <cuco/extent.cuh>
2121
#include <cuco/utility/allocator.hpp>
2222

2323
#include <cuda/std/array>
24+
#include <cuda/std/functional>
2425
#include <cuda/stream_ref>
2526

2627
#include <cstddef>
@@ -29,49 +30,36 @@
2930
#include <memory>
3031

3132
namespace cuco {
32-
/// Bucket type alias
33-
template <typename T, int32_t BucketSize>
34-
using bucket = detail::bucket<T, BucketSize>;
35-
3633
/**
37-
* @brief Non-owning array of buckets storage reference type.
34+
* @brief Non-owning array of slots storage reference type.
3835
*
3936
* @tparam T Storage element type
4037
* @tparam BucketSize Number of slots in each bucket
4138
* @tparam Extent Type of extent denoting storage capacity
4239
*/
4340
template <typename T, int32_t BucketSize, typename Extent = cuco::extent<std::size_t>>
44-
class bucket_storage_ref : public detail::bucket_storage_base<T, BucketSize, Extent> {
41+
class bucket_storage_ref {
4542
public:
46-
/// Array of buckets base class type
47-
using base_type = detail::bucket_storage_base<T, BucketSize, Extent>;
48-
49-
using base_type::bucket_size; ///< Number of elements processed per bucket
50-
51-
using extent_type = typename base_type::extent_type; ///< Storage extent type
52-
using size_type = typename base_type::size_type; ///< Storage size type
53-
using value_type = typename base_type::value_type; ///< Slot type
54-
using bucket_type = typename base_type::bucket_type; ///< Slot bucket type
43+
static constexpr int32_t bucket_size = BucketSize; ///< Number of elements processed per bucket
44+
static constexpr std::size_t alignment =
45+
cuda::std::min(sizeof(T) * bucket_size, std::size_t{16}); ///< Required alignment
5546

56-
using base_type::capacity;
57-
using base_type::num_buckets;
47+
using extent_type = Extent; ///< Storage extent type
48+
using size_type = typename extent_type::value_type; ///< Storage size type
49+
using value_type = T; ///< Slot type
50+
using bucket_type = cuda::std::array<T, BucketSize>; ///< Slot bucket type
5851

5952
/**
60-
* @brief Constructor of AoS storage ref.
53+
* @brief Constructor of slot storage ref.
6154
*
62-
* @param size Number of buckets
63-
* @param buckets Pointer to the buckets array
55+
* @param size Number of slots
56+
* @param slots Pointer to the slots array
6457
*/
6558
__host__ __device__ explicit constexpr bucket_storage_ref(Extent size,
66-
bucket_type* buckets) noexcept;
59+
value_type* slots) noexcept;
6760

68-
/**
69-
* @brief Custom un-incrementable input iterator for the convenience of `find` operations.
70-
*
71-
* @note This iterator is for read only and NOT incrementable.
72-
*/
73-
struct iterator;
74-
using const_iterator = iterator const; ///< Const forward iterator type
61+
using iterator = value_type*; ///< Iterator type
62+
using const_iterator = value_type const*; ///< Const forward iterator type
7563

7664
/**
7765
* @brief Returns an iterator to one past the last slot.
@@ -91,76 +79,90 @@ class bucket_storage_ref : public detail::bucket_storage_base<T, BucketSize, Ext
9179
*
9280
* @return A const_iterator to one past the last slot
9381
*/
94-
[[nodiscard]] __device__ constexpr const_iterator end() const noexcept;
82+
[[nodiscard]] __device__ constexpr iterator end() const noexcept;
9583

9684
/**
97-
* @brief Gets buckets array.
85+
* @brief Gets slots array.
9886
*
99-
* @return Pointer to the first bucket
87+
* @return Pointer to the first slot
10088
*/
101-
[[nodiscard]] __device__ constexpr bucket_type* data() noexcept;
89+
[[nodiscard]] __device__ constexpr value_type* data() noexcept;
10290

10391
/**
104-
* @brief Gets bucket array.
92+
* @brief Gets slots array.
10593
*
106-
* @return Pointer to the first bucket
94+
* @return Pointer to the first slot
10795
*/
108-
[[nodiscard]] __device__ constexpr bucket_type* data() const noexcept;
96+
[[nodiscard]] __device__ constexpr value_type* data() const noexcept;
10997

11098
/**
11199
* @brief Returns an array of slots (or a bucket) for a given index.
112100
*
113-
* @param index Index of the bucket
101+
* @param index Index of the slot
114102
* @return An array of slots
115103
*/
116104
[[nodiscard]] __device__ constexpr bucket_type operator[](size_type index) const noexcept;
117105

106+
/**
107+
* @brief Gets the total number of slot buckets in the current storage.
108+
*
109+
* @return The total number of slot buckets
110+
*/
111+
[[nodiscard]] __host__ __device__ constexpr size_type num_buckets() const noexcept;
112+
113+
/**
114+
* @brief Gets the total number of slots in the current storage.
115+
*
116+
* @return The total number of slots
117+
*/
118+
[[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept;
119+
120+
/**
121+
* @brief Gets the bucket extent of the current storage.
122+
*
123+
* @return The bucket extent.
124+
*/
125+
[[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept;
126+
118127
private:
119-
bucket_type* buckets_; ///< Pointer to the buckets array
128+
extent_type extent_; ///< Storage extent
129+
value_type* slots_; ///< Pointer to the slots array
120130
};
121131

122132
/**
123-
* @brief Array of buckets open addressing storage class.
133+
* @brief Array of slots open addressing storage class.
124134
*
125135
* @tparam T Slot type
126136
* @tparam BucketSize Number of slots in each bucket
127-
* @tparam Extent Type of extent denoting number of buckets
137+
* @tparam Extent Type of extent denoting number of slots
128138
* @tparam Allocator Type of allocator used for device storage (de)allocation
129139
*/
130140
template <typename T,
131141
int32_t BucketSize,
132142
typename Extent = cuco::extent<std::size_t>,
133-
typename Allocator = cuco::cuda_allocator<cuco::bucket<T, BucketSize>>>
134-
class bucket_storage : public detail::bucket_storage_base<T, BucketSize, Extent> {
143+
typename Allocator = cuco::cuda_allocator<T>>
144+
class bucket_storage {
135145
public:
136-
/// Array of buckets base class type
137-
using base_type = detail::bucket_storage_base<T, BucketSize, Extent>;
138-
139-
using base_type::bucket_size; ///< Number of elements processed per bucket
146+
static constexpr int32_t bucket_size = BucketSize; ///< Number of elements processed per bucket
140147

141-
using extent_type = typename base_type::extent_type; ///< Storage extent type
142-
using size_type = typename base_type::size_type; ///< Storage size type
143-
using value_type = typename base_type::value_type; ///< Slot type
144-
using bucket_type = typename base_type::bucket_type; ///< Slot bucket type
145-
146-
using base_type::capacity;
147-
using base_type::num_buckets;
148+
using extent_type = Extent; ///< Storage extent type
149+
using size_type = typename extent_type::value_type; ///< Storage size type
150+
using value_type = T; ///< Slot type
151+
using bucket_type = cuda::std::array<T, BucketSize>; ///< Slot bucket type
148152

149153
/// Type of the allocator to (de)allocate buckets
150154
using allocator_type =
151-
typename std::allocator_traits<Allocator>::template rebind_alloc<bucket_type>;
152-
using bucket_deleter_type =
153-
detail::custom_deleter<size_type, allocator_type>; ///< Type of bucket deleter
155+
typename std::allocator_traits<Allocator>::template rebind_alloc<value_type>;
154156
using ref_type = bucket_storage_ref<value_type, bucket_size, extent_type>; ///< Storage ref type
155157

156158
/**
157-
* @brief Constructor of bucket storage.
159+
* @brief Constructor of bucket slot storage.
158160
*
159161
* @note The input `size` should be exclusively determined by the return value of
160-
* `make_bucket_extent` since it depends on the requested low-bound value, the probing scheme, and
162+
* `make_valid_extent` since it depends on the requested low-bound value, the probing scheme, and
161163
* the storage.
162164
*
163-
* @param size Number of buckets to (de)allocate
165+
* @param size Number of slots to (de)allocate
164166
* @param allocator Allocator used for (de)allocating device storage
165167
*/
166168
explicit constexpr bucket_storage(Extent size, Allocator const& allocator = {});
@@ -178,11 +180,11 @@ class bucket_storage : public detail::bucket_storage_base<T, BucketSize, Extent>
178180
bucket_storage& operator=(bucket_storage const&) = delete;
179181

180182
/**
181-
* @brief Gets buckets array.
183+
* @brief Gets bucket slots array.
182184
*
183-
* @return Pointer to the first bucket
185+
* @return Pointer to the first slot
184186
*/
185-
[[nodiscard]] constexpr bucket_type* data() const noexcept;
187+
[[nodiscard]] constexpr value_type* data() const noexcept;
186188

187189
/**
188190
* @brief Gets the storage allocator.
@@ -199,7 +201,7 @@ class bucket_storage : public detail::bucket_storage_base<T, BucketSize, Extent>
199201
[[nodiscard]] constexpr ref_type ref() const noexcept;
200202

201203
/**
202-
* @brief Initializes each slot in the bucket storage to contain `key`.
204+
* @brief Initializes each slot in the bucket slot storage to contain `key`.
203205
*
204206
* @param key Key to which all keys in `slots` are initialized
205207
* @param stream Stream used for executing the kernel
@@ -212,13 +214,37 @@ class bucket_storage : public detail::bucket_storage_base<T, BucketSize, Extent>
212214
* @param key Key to which all keys in `slots` are initialized
213215
* @param stream Stream used for executing the kernel
214216
*/
215-
void initialize_async(value_type key, cuda::stream_ref stream = {}) noexcept;
217+
void initialize_async(value_type key, cuda::stream_ref stream = {});
218+
219+
/**
220+
* @brief Gets the total number of slot buckets in the current storage.
221+
*
222+
* @return The total number of slot buckets
223+
*/
224+
[[nodiscard]] __host__ __device__ constexpr size_type num_buckets() const noexcept;
225+
226+
/**
227+
* @brief Gets the total number of slots in the current storage.
228+
*
229+
* @return The total number of slots
230+
*/
231+
[[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept;
232+
233+
/**
234+
* @brief Gets the bucket extent of the current storage.
235+
*
236+
* @return The bucket extent.
237+
*/
238+
[[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept;
216239

217240
private:
218-
allocator_type allocator_; ///< Allocator used to (de)allocate buckets
219-
bucket_deleter_type bucket_deleter_; ///< Custom buckets deleter
220-
/// Pointer to the bucket storage
221-
std::unique_ptr<bucket_type, bucket_deleter_type> buckets_;
241+
using slot_deleter_type =
242+
detail::custom_deleter<size_type, allocator_type>; ///< Type of slot deleter
243+
244+
extent_type extent_; ///< Storage extent
245+
allocator_type allocator_; ///< Allocator used to (de)allocate slots
246+
/// Pointer to the slot storage
247+
std::unique_ptr<value_type, slot_deleter_type> slots_;
222248
};
223249
} // namespace cuco
224250

0 commit comments

Comments
 (0)