Skip to content

Commit 144be85

Browse files
committed
Pass ParentT to CG type and pass CG objects by-value
1 parent d091dd6 commit 144be85

37 files changed

+369
-329
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

examples/static_set/device_ref_example.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ __global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::s
4242

4343
constexpr auto cg_size = SetRef::cg_size;
4444

45-
auto tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
45+
auto tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
4646

4747
int64_t const loop_stride = gridDim.x * blockDim.x / cg_size;
4848
int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
@@ -60,7 +60,7 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O
6060

6161
constexpr auto cg_size = SetRef::cg_size;
6262

63-
auto tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
63+
auto tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
6464

6565
int64_t const loop_stride = gridDim.x * blockDim.x / cg_size;
6666
int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

examples/static_set/device_subsets_example.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ __global__ void insert(ref_type* set_refs)
8080
{
8181
namespace cg = cooperative_groups;
8282

83-
auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
83+
auto const tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
8484
// Get subset (or CG) index
8585
auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
8686

@@ -105,7 +105,7 @@ __global__ void find(ref_type* set_refs)
105105
{
106106
namespace cg = cooperative_groups;
107107

108-
auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
108+
auto const tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
109109
auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
110110

111111
auto raw_set_ref = *(set_refs + idx);

include/cuco/bloom_filter_ref.cuh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ class bloom_filter_ref {
9191
* @param group The Cooperative Group this operation is executed with
9292
*/
9393
template <class CG>
94-
__device__ constexpr void clear(CG const& group);
94+
__device__ constexpr void clear(CG group);
9595

9696
/**
9797
* @brief Erases all information from the filter.
@@ -132,7 +132,7 @@ class bloom_filter_ref {
132132
* @param key The key to be added
133133
*/
134134
template <class CG, class ProbeKey>
135-
__device__ void add(CG const& group, ProbeKey const& key);
135+
__device__ void add(CG group, ProbeKey const& key);
136136

137137
/**
138138
* @brief Device function that adds all keys in the range `[first, last)` to the filter.
@@ -148,7 +148,7 @@ class bloom_filter_ref {
148148
* @param last End of the sequence of keys
149149
*/
150150
template <class CG, class InputIt>
151-
__device__ void add(CG const& group, InputIt first, InputIt last);
151+
__device__ void add(CG group, InputIt first, InputIt last);
152152

153153
/**
154154
* @brief Adds all keys in the range `[first, last)` to the filter.
@@ -255,11 +255,11 @@ class bloom_filter_ref {
255255
* @return `true` iff the key's fingerprint was present in the filter
256256
*/
257257
template <class CG, class ProbeKey>
258-
[[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;
258+
[[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const;
259259

260260
// TODO
261261
// template <class CG, class InputIt, class OutputIt>
262-
// __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)
262+
// __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin)
263263
// const;
264264

265265
/**

include/cuco/detail/bloom_filter/bloom_filter_impl.cuh

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ class bloom_filter_impl {
9595
}
9696

9797
template <class CG>
98-
__device__ constexpr void clear(CG const& group)
98+
__device__ constexpr void clear(CG group)
9999
{
100100
for (int i = group.thread_rank(); i < num_blocks_ * words_per_block; i += group.size()) {
101101
words_[i] = 0;
@@ -149,7 +149,7 @@ class bloom_filter_impl {
149149
}
150150

151151
template <class CG, class ProbeKey>
152-
__device__ void add(CG const& group, ProbeKey const& key)
152+
__device__ void add(CG group, ProbeKey const& key)
153153
{
154154
constexpr auto num_threads = tile_size_v<CG>;
155155
constexpr auto optimal_num_threads = add_optimal_cg_size();
@@ -166,7 +166,7 @@ class bloom_filter_impl {
166166
}
167167

168168
template <class CG, class InputIt>
169-
__device__ void add(CG const& group, InputIt first, InputIt last)
169+
__device__ void add(CG group, InputIt first, InputIt last)
170170
{
171171
namespace cg = cooperative_groups;
172172

@@ -208,7 +208,7 @@ class bloom_filter_impl {
208208
typename policy_type::hash_result_type hash_value;
209209
size_type block_index;
210210

211-
auto const worker_group = cg::tiled_partition<worker_num_threads>(group);
211+
auto const worker_group = cg::tiled_partition<worker_num_threads, CG>(group);
212212
auto const worker_offset = worker_num_threads * worker_group.meta_group_rank();
213213

214214
auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
@@ -229,7 +229,7 @@ class bloom_filter_impl {
229229
}
230230

231231
template <class CG, class HashValue, class BlockIndex>
232-
__device__ void add_impl(CG const& group, HashValue const& hash_value, BlockIndex block_index)
232+
__device__ void add_impl(CG group, HashValue const& hash_value, BlockIndex block_index)
233233
{
234234
constexpr auto num_threads = tile_size_v<CG>;
235235

@@ -327,7 +327,7 @@ class bloom_filter_impl {
327327
}
328328

329329
template <class CG, class ProbeKey>
330-
[[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const
330+
[[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const
331331
{
332332
constexpr auto num_threads = tile_size_v<CG>;
333333
constexpr auto optimal_num_threads = contains_optimal_cg_size();
@@ -359,7 +359,7 @@ class bloom_filter_impl {
359359

360360
// TODO
361361
// template <class CG, class InputIt, class OutputIt>
362-
// __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)
362+
// __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin)
363363
// const;
364364

365365
template <class InputIt, class OutputIt>
@@ -432,7 +432,7 @@ class bloom_filter_impl {
432432
// [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const
433433
// [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks)
434434
// template <typename CG, cuda::thread_scope NewScope = thread_scope>
435-
// [[nodiscard]] __device__ constexpr auto make_copy(CG const& group, word_type* const
435+
// [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const
436436
// memory_to_use, cuda_thread_scope<NewScope> scope = {}) const noexcept;
437437

438438
private:

include/cuco/detail/bloom_filter/bloom_filter_ref.inl

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ __host__ __device__ constexpr bloom_filter_ref<Key, Extent, Scope, Policy>::bloo
3939

4040
template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
4141
template <class CG>
42-
__device__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::clear(CG const& group)
42+
__device__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::clear(CG group)
4343
{
4444
impl_.clear(group);
4545
}
@@ -66,15 +66,14 @@ __device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(ProbeKey const
6666

6767
template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
6868
template <class CG, class ProbeKey>
69-
__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG const& group,
70-
ProbeKey const& key)
69+
__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG group, ProbeKey const& key)
7170
{
7271
impl_.add(group, key);
7372
}
7473

7574
template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
7675
template <class CG, class InputIt>
77-
__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG const& group,
76+
__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG group,
7877
InputIt first,
7978
InputIt last)
8079
{
@@ -125,7 +124,7 @@ template <class ProbeKey>
125124
template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
126125
template <class CG, class ProbeKey>
127126
[[nodiscard]] __device__ bool bloom_filter_ref<Key, Extent, Scope, Policy>::contains(
128-
CG const& group, ProbeKey const& key) const
127+
CG group, ProbeKey const& key) const
129128
{
130129
return impl_.contains(group, key);
131130
}

include/cuco/detail/bloom_filter/kernels.cuh

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first,
4444
if (tile_start >= n) { return; }
4545
auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n;
4646

47-
auto const tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
47+
auto const tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
4848

4949
ref.add(tile, first + tile_start, first + tile_stop);
5050
}
@@ -63,7 +63,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
6363
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
6464
auto idx = cuco::detail::global_thread_id() / CGSize;
6565

66-
[[maybe_unused]] auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
66+
[[maybe_unused]] auto const tile =
67+
cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
6768

6869
while (idx < n) {
6970
if (pred(*(stencil + idx))) {
@@ -94,7 +95,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
9495
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
9596
auto idx = cuco::detail::global_thread_id() / CGSize;
9697

97-
[[maybe_unused]] auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
98+
[[maybe_unused]] auto const tile =
99+
cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
98100

99101
if constexpr (CGSize == 1) {
100102
while (idx < n) {
@@ -103,7 +105,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
103105
idx += loop_stride;
104106
}
105107
} else {
106-
auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
108+
auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
107109
while (idx < n) {
108110
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
109111
auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;

include/cuco/detail/dynamic_map_kernels.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ CUCO_KERNEL void insert(InputIt first,
166166
__shared__ typename BlockReduce::TempStorage temp_storage;
167167
std::size_t thread_num_successes = 0;
168168

169-
auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
169+
auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
170170
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
171171
auto it = first + tid / tile_size;
172172

@@ -312,7 +312,7 @@ CUCO_KERNEL void erase(InputIt first,
312312
extern __shared__ unsigned long long submap_block_num_successes[];
313313

314314
auto block = cg::this_thread_block();
315-
auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
315+
auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
316316
auto tid = block_size * block.group_index().x + block.thread_rank();
317317
auto it = first + tid / tile_size;
318318

@@ -456,9 +456,9 @@ CUCO_KERNEL void find(InputIt first,
456456
Hash hash,
457457
KeyEqual key_equal)
458458
{
459-
auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
460-
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
461-
auto key_idx = tid / tile_size;
459+
auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
460+
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
461+
auto key_idx = tid / tile_size;
462462
auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel();
463463
__shared__ Value writeBuffer[block_size];
464464

@@ -677,7 +677,7 @@ CUCO_KERNEL void contains(InputIt first,
677677
Hash hash,
678678
KeyEqual key_equal)
679679
{
680-
auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
680+
auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
681681
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
682682
auto key_idx = tid / tile_size;
683683
__shared__ bool writeBuffer[block_size];

include/cuco/detail/hyperloglog/hyperloglog_impl.cuh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ class hyperloglog_impl {
106106
* @param group CUDA Cooperative group this operation is executed in
107107
*/
108108
template <class CG>
109-
__device__ constexpr void clear(CG const& group) noexcept
109+
__device__ constexpr void clear(CG group) noexcept
110110
{
111111
for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
112112
new (&(this->sketch_[i])) register_type{};
@@ -280,8 +280,7 @@ class hyperloglog_impl {
280280
* @param other Other estimator reference to be merged into `*this`
281281
*/
282282
template <class CG, cuda::thread_scope OtherScope>
283-
__device__ constexpr void merge(CG const& group,
284-
hyperloglog_impl<T, OtherScope, Hash> const& other)
283+
__device__ constexpr void merge(CG group, hyperloglog_impl<T, OtherScope, Hash> const& other)
285284
{
286285
// TODO find a better way to do error handling in device code
287286
// if (other.precision_ != this->precision_) { __trap(); }
@@ -362,7 +361,8 @@ class hyperloglog_impl {
362361
}
363362

364363
// warp reduce Z and V
365-
auto const warp = cooperative_groups::tiled_partition<32>(group);
364+
auto const warp =
365+
cooperative_groups::tiled_partition<32, cooperative_groups::thread_block>(group);
366366
#if defined(CUCO_HAS_CG_REDUCE_UPDATE_ASYNC)
367367
cooperative_groups::reduce_update_async(
368368
warp, block_sum, thread_sum, cooperative_groups::plus<fp_type>());

include/cuco/detail/hyperloglog/hyperloglog_ref.inl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ __host__ __device__ constexpr hyperloglog_ref<T, Scope, Hash>::hyperloglog_ref(
2525

2626
template <class T, cuda::thread_scope Scope, class Hash>
2727
template <class CG>
28-
__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear(CG const& group) noexcept
28+
__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear(CG group) noexcept
2929
{
3030
impl_.clear(group);
3131
}
@@ -70,7 +70,7 @@ __host__ constexpr void hyperloglog_ref<T, Scope, Hash>::add(InputIt first,
7070
template <class T, cuda::thread_scope Scope, class Hash>
7171
template <class CG, cuda::thread_scope OtherScope>
7272
__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::merge(
73-
CG const& group, hyperloglog_ref<T, OtherScope, Hash> const& other)
73+
CG group, hyperloglog_ref<T, OtherScope, Hash> const& other)
7474
{
7575
impl_.merge(group, other.impl_);
7676
}

0 commit comments

Comments
 (0)