NVIDIA
diff --git a/README.md b/README.md
Lines changed: 2 additions & 2 deletions
diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu
Lines changed: 2 additions & 2 deletions
diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu
Lines changed: 2 additions & 2 deletions
diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh
Lines changed: 5 additions & 5 deletions
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
Lines changed: 8 additions & 8 deletions
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
Lines changed: 4 additions & 5 deletions
diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh
Lines changed: 6 additions & 4 deletions
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
Lines changed: 6 additions & 6 deletions
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
Lines changed: 4 additions & 4 deletions
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.inl b/include/cuco/detail/hyperloglog/hyperloglog_ref.inl
Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ __global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::s
 
   constexpr auto cg_size = SetRef::cg_size;
 
-  auto tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
 
   int64_t const loop_stride = gridDim.x * blockDim.x / cg_size;
   int64_t idx               = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
@@ -60,7 +60,7 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O
 
   constexpr auto cg_size = SetRef::cg_size;
 
-  auto tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
 
   int64_t const loop_stride = gridDim.x * blockDim.x / cg_size;
   int64_t idx               = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
 
@@ -80,7 +80,7 @@ __global__ void insert(ref_type* set_refs)
 {
   namespace cg = cooperative_groups;
 
-  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto const tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
   // Get subset (or CG) index
   auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
 
@@ -105,7 +105,7 @@ __global__ void find(ref_type* set_refs)
 {
   namespace cg = cooperative_groups;
 
-  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto const tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
   auto const idx  = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
 
   auto raw_set_ref  = *(set_refs + idx);
 
@@ -91,7 +91,7 @@ class bloom_filter_ref {
    * @param group The Cooperative Group this operation is executed with
    */
   template <class CG>
-  __device__ constexpr void clear(CG const& group);
+  __device__ constexpr void clear(CG group);
 
   /**
    * @brief Erases all information from the filter.
@@ -132,7 +132,7 @@ class bloom_filter_ref {
    * @param key The key to be added
    */
   template <class CG, class ProbeKey>
-  __device__ void add(CG const& group, ProbeKey const& key);
+  __device__ void add(CG group, ProbeKey const& key);
 
   /**
    * @brief Device function that adds all keys in the range `[first, last)` to the filter.
@@ -148,7 +148,7 @@ class bloom_filter_ref {
    * @param last End of the sequence of keys
    */
   template <class CG, class InputIt>
-  __device__ void add(CG const& group, InputIt first, InputIt last);
+  __device__ void add(CG group, InputIt first, InputIt last);
 
   /**
    * @brief Adds all keys in the range `[first, last)` to the filter.
@@ -255,11 +255,11 @@ class bloom_filter_ref {
    * @return `true` iff the key's fingerprint was present in the filter
    */
   template <class CG, class ProbeKey>
-  [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;
+  [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const;
 
   // TODO
   // template <class CG, class InputIt, class OutputIt>
-  // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)
+  // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin)
   // const;
 
   /**
 
@@ -95,7 +95,7 @@ class bloom_filter_impl {
   }
 
   template <class CG>
-  __device__ constexpr void clear(CG const& group)
+  __device__ constexpr void clear(CG group)
   {
     for (int i = group.thread_rank(); i < num_blocks_ * words_per_block; i += group.size()) {
       words_[i] = 0;
@@ -149,7 +149,7 @@ class bloom_filter_impl {
   }
 
   template <class CG, class ProbeKey>
-  __device__ void add(CG const& group, ProbeKey const& key)
+  __device__ void add(CG group, ProbeKey const& key)
   {
     constexpr auto num_threads         = tile_size_v<CG>;
     constexpr auto optimal_num_threads = add_optimal_cg_size();
@@ -166,7 +166,7 @@ class bloom_filter_impl {
   }
 
   template <class CG, class InputIt>
-  __device__ void add(CG const& group, InputIt first, InputIt last)
+  __device__ void add(CG group, InputIt first, InputIt last)
   {
     namespace cg = cooperative_groups;
 
@@ -208,7 +208,7 @@ class bloom_filter_impl {
       typename policy_type::hash_result_type hash_value;
       size_type block_index;
 
-      auto const worker_group  = cg::tiled_partition<worker_num_threads>(group);
+      auto const worker_group  = cg::tiled_partition<worker_num_threads, CG>(group);
       auto const worker_offset = worker_num_threads * worker_group.meta_group_rank();
 
       auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
@@ -229,7 +229,7 @@ class bloom_filter_impl {
   }
 
   template <class CG, class HashValue, class BlockIndex>
-  __device__ void add_impl(CG const& group, HashValue const& hash_value, BlockIndex block_index)
+  __device__ void add_impl(CG group, HashValue const& hash_value, BlockIndex block_index)
   {
     constexpr auto num_threads = tile_size_v<CG>;
 
@@ -327,7 +327,7 @@ class bloom_filter_impl {
   }
 
   template <class CG, class ProbeKey>
-  [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const
+  [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const
   {
     constexpr auto num_threads         = tile_size_v<CG>;
     constexpr auto optimal_num_threads = contains_optimal_cg_size();
@@ -359,7 +359,7 @@ class bloom_filter_impl {
 
   // TODO
   // template <class CG, class InputIt, class OutputIt>
-  // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)
+  // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin)
   // const;
 
   template <class InputIt, class OutputIt>
@@ -432,7 +432,7 @@ class bloom_filter_impl {
   // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const
   // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks)
   // template <typename CG, cuda::thread_scope NewScope = thread_scope>
-  // [[nodiscard]] __device__ constexpr auto make_copy(CG const& group, word_type* const
+  // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const
   // memory_to_use, cuda_thread_scope<NewScope> scope = {}) const noexcept;
 
  private:
 
@@ -39,7 +39,7 @@ __host__ __device__ constexpr bloom_filter_ref<Key, Extent, Scope, Policy>::bloo
 
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class CG>
-__device__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::clear(CG const& group)
+__device__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::clear(CG group)
 {
   impl_.clear(group);
 }
@@ -66,15 +66,14 @@ __device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(ProbeKey const
 
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class CG, class ProbeKey>
-__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG const& group,
-                                                                  ProbeKey const& key)
+__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG group, ProbeKey const& key)
 {
   impl_.add(group, key);
 }
 
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class CG, class InputIt>
-__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG const& group,
+__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG group,
                                                                   InputIt first,
                                                                   InputIt last)
 {
@@ -125,7 +124,7 @@ template <class ProbeKey>
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class CG, class ProbeKey>
 [[nodiscard]] __device__ bool bloom_filter_ref<Key, Extent, Scope, Policy>::contains(
-  CG const& group, ProbeKey const& key) const
+  CG group, ProbeKey const& key) const
 {
   return impl_.contains(group, key);
 }
 
@@ -44,7 +44,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first,
   if (tile_start >= n) { return; }
   auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n;
 
-  auto const tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto const tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
 
   ref.add(tile, first + tile_start, first + tile_stop);
 }
@@ -63,7 +63,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
-  [[maybe_unused]] auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+  [[maybe_unused]] auto const tile =
+    cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
@@ -94,7 +95,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
-  [[maybe_unused]] auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+  [[maybe_unused]] auto const tile =
+    cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
 
   if constexpr (CGSize == 1) {
     while (idx < n) {
@@ -103,7 +105,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
       idx += loop_stride;
     }
   } else {
-    auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+    auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
     while (idx < n) {
       typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
       auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
 
@@ -166,7 +166,7 @@ CUCO_KERNEL void insert(InputIt first,
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t thread_num_successes = 0;
 
-  auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   auto tid  = blockDim.x * blockIdx.x + threadIdx.x;
   auto it   = first + tid / tile_size;
 
@@ -312,7 +312,7 @@ CUCO_KERNEL void erase(InputIt first,
   extern __shared__ unsigned long long submap_block_num_successes[];
 
   auto block = cg::this_thread_block();
-  auto tile  = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile  = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   auto tid   = block_size * block.group_index().x + block.thread_rank();
   auto it    = first + tid / tile_size;
 
@@ -456,9 +456,9 @@ CUCO_KERNEL void find(InputIt first,
                       Hash hash,
                       KeyEqual key_equal)
 {
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
-  auto tid                  = blockDim.x * blockIdx.x + threadIdx.x;
-  auto key_idx              = tid / tile_size;
+  auto tile    = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
+  auto tid     = blockDim.x * blockIdx.x + threadIdx.x;
+  auto key_idx = tid / tile_size;
   auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel();
   __shared__ Value writeBuffer[block_size];
 
@@ -677,7 +677,7 @@ CUCO_KERNEL void contains(InputIt first,
                           Hash hash,
                           KeyEqual key_equal)
 {
-  auto tile    = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile    = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   auto tid     = blockDim.x * blockIdx.x + threadIdx.x;
   auto key_idx = tid / tile_size;
   __shared__ bool writeBuffer[block_size];
 
@@ -106,7 +106,7 @@ class hyperloglog_impl {
    * @param group CUDA Cooperative group this operation is executed in
    */
   template <class CG>
-  __device__ constexpr void clear(CG const& group) noexcept
+  __device__ constexpr void clear(CG group) noexcept
   {
     for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
       new (&(this->sketch_[i])) register_type{};
@@ -280,8 +280,7 @@ class hyperloglog_impl {
    * @param other Other estimator reference to be merged into `*this`
    */
   template <class CG, cuda::thread_scope OtherScope>
-  __device__ constexpr void merge(CG const& group,
-                                  hyperloglog_impl<T, OtherScope, Hash> const& other)
+  __device__ constexpr void merge(CG group, hyperloglog_impl<T, OtherScope, Hash> const& other)
   {
     // TODO find a better way to do error handling in device code
     // if (other.precision_ != this->precision_) { __trap(); }
@@ -362,7 +361,8 @@ class hyperloglog_impl {
     }
 
     // warp reduce Z and V
-    auto const warp = cooperative_groups::tiled_partition<32>(group);
+    auto const warp =
+      cooperative_groups::tiled_partition<32, cooperative_groups::thread_block>(group);
 #if defined(CUCO_HAS_CG_REDUCE_UPDATE_ASYNC)
     cooperative_groups::reduce_update_async(
       warp, block_sum, thread_sum, cooperative_groups::plus<fp_type>());
 
@@ -25,7 +25,7 @@ __host__ __device__ constexpr hyperloglog_ref<T, Scope, Hash>::hyperloglog_ref(
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG>
-__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear(CG const& group) noexcept
+__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear(CG group) noexcept
 {
   impl_.clear(group);
 }
@@ -70,7 +70,7 @@ __host__ constexpr void hyperloglog_ref<T, Scope, Hash>::add(InputIt first,
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG, cuda::thread_scope OtherScope>
 __device__ constexpr void hyperloglog_ref<T, Scope, Hash>::merge(
-  CG const& group, hyperloglog_ref<T, OtherScope, Hash> const& other)
+  CG group, hyperloglog_ref<T, OtherScope, Hash> const& other)
 {
   impl_.merge(group, other.impl_);
 }
Original file line number	Diff line number	Diff line change
`@@ -80,7 +80,7 @@ __global__ void insert(ref_type* set_refs)`
`80`	`80`	`{`
`81`	`81`	`namespace cg = cooperative_groups;`
`82`	`82`
`83`		`- auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());`
	`83`	`+ auto const tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());`
`84`	`84`	`// Get subset (or CG) index`
`85`	`85`	`auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;`
`86`	`86`
`@@ -105,7 +105,7 @@ __global__ void find(ref_type* set_refs)`
`105`	`105`	`{`
`106`	`106`	`namespace cg = cooperative_groups;`
`107`	`107`
`108`		`- auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());`
	`108`	`+ auto const tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());`
`109`	`109`	`auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;`
`110`	`110`
`111`	`111`	`auto raw_set_ref = *(set_refs + idx);`
Original file line number	Diff line number	Diff line change
`@@ -95,7 +95,7 @@ class bloom_filter_impl {`
`95`	`95`	`}`
`96`	`96`
`97`	`97`	`template <class CG>`
`98`		`- __device__ constexpr void clear(CG const& group)`
	`98`	`+ __device__ constexpr void clear(CG group)`
`99`	`99`	`{`
`100`	`100`	`for (int i = group.thread_rank(); i < num_blocks_ * words_per_block; i += group.size()) {`
`101`	`101`	`words_[i] = 0;`
`@@ -149,7 +149,7 @@ class bloom_filter_impl {`
`149`	`149`	`}`
`150`	`150`
`151`	`151`	`template <class CG, class ProbeKey>`
`152`		`- __device__ void add(CG const& group, ProbeKey const& key)`
	`152`	`+ __device__ void add(CG group, ProbeKey const& key)`
`153`	`153`	`{`
`154`	`154`	`constexpr auto num_threads = tile_size_v<CG>;`
`155`	`155`	`constexpr auto optimal_num_threads = add_optimal_cg_size();`
`@@ -166,7 +166,7 @@ class bloom_filter_impl {`
`166`	`166`	`}`
`167`	`167`
`168`	`168`	`template <class CG, class InputIt>`
`169`		`- __device__ void add(CG const& group, InputIt first, InputIt last)`
	`169`	`+ __device__ void add(CG group, InputIt first, InputIt last)`
`170`	`170`	`{`
`171`	`171`	`namespace cg = cooperative_groups;`
`172`	`172`
`@@ -208,7 +208,7 @@ class bloom_filter_impl {`
`208`	`208`	`typename policy_type::hash_result_type hash_value;`
`209`	`209`	`size_type block_index;`
`210`	`210`
`211`		`- auto const worker_group = cg::tiled_partition<worker_num_threads>(group);`
	`211`	`+ auto const worker_group = cg::tiled_partition<worker_num_threads, CG>(group);`
`212`	`212`	`auto const worker_offset = worker_num_threads * worker_group.meta_group_rank();`
`213`	`213`
`214`	`214`	`auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);`
`@@ -229,7 +229,7 @@ class bloom_filter_impl {`
`229`	`229`	`}`
`230`	`230`
`231`	`231`	`template <class CG, class HashValue, class BlockIndex>`
`232`		`- __device__ void add_impl(CG const& group, HashValue const& hash_value, BlockIndex block_index)`
	`232`	`+ __device__ void add_impl(CG group, HashValue const& hash_value, BlockIndex block_index)`
`233`	`233`	`{`
`234`	`234`	`constexpr auto num_threads = tile_size_v<CG>;`
`235`	`235`
`@@ -327,7 +327,7 @@ class bloom_filter_impl {`
`327`	`327`	`}`
`328`	`328`
`329`	`329`	`template <class CG, class ProbeKey>`
`330`		`- [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const`
	`330`	`+ [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const`
`331`	`331`	`{`
`332`	`332`	`constexpr auto num_threads = tile_size_v<CG>;`
`333`	`333`	`constexpr auto optimal_num_threads = contains_optimal_cg_size();`
`@@ -359,7 +359,7 @@ class bloom_filter_impl {`
`359`	`359`
`360`	`360`	`// TODO`
`361`	`361`	`// template <class CG, class InputIt, class OutputIt>`
`362`		`- // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)`
	`362`	`+ // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin)`
`363`	`363`	`// const;`
`364`	`364`
`365`	`365`	`template <class InputIt, class OutputIt>`
`@@ -432,7 +432,7 @@ class bloom_filter_impl {`
`432`	`432`	`// [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const`
`433`	`433`	`// [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks)`
`434`	`434`	`// template <typename CG, cuda::thread_scope NewScope = thread_scope>`
`435`		`- // [[nodiscard]] __device__ constexpr auto make_copy(CG const& group, word_type* const`
	`435`	`+ // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const`
`436`	`436`	`// memory_to_use, cuda_thread_scope<NewScope> scope = {}) const noexcept;`
`437`	`437`
`438`	`438`	`private:`
Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,7 @@ __host__ __device__ constexpr bloom_filter_ref<Key, Extent, Scope, Policy>::bloo`
`39`	`39`
`40`	`40`	`template <class Key, class Extent, cuda::thread_scope Scope, class Policy>`
`41`	`41`	`template <class CG>`
`42`		`-__device__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::clear(CG const& group)`
	`42`	`+__device__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::clear(CG group)`
`43`	`43`	`{`
`44`	`44`	`impl_.clear(group);`
`45`	`45`	`}`
`@@ -66,15 +66,14 @@ __device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(ProbeKey const`
`66`	`66`
`67`	`67`	`template <class Key, class Extent, cuda::thread_scope Scope, class Policy>`
`68`	`68`	`template <class CG, class ProbeKey>`
`69`		`-__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG const& group,`
`70`		`- ProbeKey const& key)`
	`69`	`+__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG group, ProbeKey const& key)`
`71`	`70`	`{`
`72`	`71`	`impl_.add(group, key);`
`73`	`72`	`}`
`74`	`73`
`75`	`74`	`template <class Key, class Extent, cuda::thread_scope Scope, class Policy>`
`76`	`75`	`template <class CG, class InputIt>`
`77`		`-__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG const& group,`
	`76`	`+__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG group,`
`78`	`77`	`InputIt first,`
`79`	`78`	`InputIt last)`
`80`	`79`	`{`
`@@ -125,7 +124,7 @@ template <class ProbeKey>`
`125`	`124`	`template <class Key, class Extent, cuda::thread_scope Scope, class Policy>`
`126`	`125`	`template <class CG, class ProbeKey>`
`127`	`126`	`[[nodiscard]] __device__ bool bloom_filter_ref<Key, Extent, Scope, Policy>::contains(`
`128`		`- CG const& group, ProbeKey const& key) const`
	`127`	`+ CG group, ProbeKey const& key) const`
`129`	`128`	`{`
`130`	`129`	`return impl_.contains(group, key);`
`131`	`130`	`}`
Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@ __host__ __device__ constexpr hyperloglog_ref<T, Scope, Hash>::hyperloglog_ref(`
`25`	`25`
`26`	`26`	`template <class T, cuda::thread_scope Scope, class Hash>`
`27`	`27`	`template <class CG>`
`28`		`-__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear(CG const& group) noexcept`
	`28`	`+__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear(CG group) noexcept`
`29`	`29`	`{`
`30`	`30`	`impl_.clear(group);`
`31`	`31`	`}`
`@@ -70,7 +70,7 @@ __host__ constexpr void hyperloglog_ref<T, Scope, Hash>::add(InputIt first,`
`70`	`70`	`template <class T, cuda::thread_scope Scope, class Hash>`
`71`	`71`	`template <class CG, cuda::thread_scope OtherScope>`
`72`	`72`	`__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::merge(`
`73`		`- CG const& group, hyperloglog_ref<T, OtherScope, Hash> const& other)`
	`73`	`+ CG group, hyperloglog_ref<T, OtherScope, Hash> const& other)`
`74`	`74`	`{`
`75`	`75`	`impl_.merge(group, other.impl_);`
`76`	`76`	`}`