Pass trivial types by value

PointKernel · PointKernel · commit 0cb59a67c34a · 2025-08-13T18:27:46.000-07:00
diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh
@@ -83,7 +83,7 @@ __host__ __device__ constexpr std::size_t alignment()
  * @return If the bits in the object representations of lhs and rhs are identical.
  */
 template <typename T>
-__host__ __device__ constexpr bool bitwise_compare(T const& lhs, T const& rhs)
+__host__ __device__ constexpr bool bitwise_compare(T lhs, T rhs)
 {
   static_assert(
     cuco::is_bitwise_comparable_v<T>,
diff --git a/include/cuco/detail/open_addressing/functors.cuh b/include/cuco/detail/open_addressing/functors.cuh
@@ -73,7 +73,7 @@ struct slot_is_filled {
    * @param empty_sentinel Key sentinel indicating an empty slot
    * @param erased_sentinel Key sentinel indicating an erased slot
    */
-  explicit constexpr slot_is_filled(T const& empty_sentinel, T const& erased_sentinel) noexcept
+  explicit constexpr slot_is_filled(T empty_sentinel, T erased_sentinel) noexcept
     : empty_sentinel_{empty_sentinel}, erased_sentinel_{erased_sentinel}
   {
   }
@@ -88,7 +88,7 @@ struct slot_is_filled {
    * @return `true` if slot is filled
    */
   template <typename S>
-  __device__ constexpr bool operator()(S const& slot) const noexcept
+  __device__ constexpr bool operator()(S slot) const noexcept
   {
     auto const key = [&]() {
       if constexpr (HasPayload) {
diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh
@@ -78,8 +78,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(InputIt first,
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
-      typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
-        *(first + idx)};
+      typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{*(first + idx)};
       if constexpr (CGSize == 1) {
         if (ref.insert(insert_element)) { thread_num_successes++; };
       } else {
@@ -137,8 +136,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
-      typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
-        *(first + idx)};
+      typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{*(first + idx)};
       if constexpr (CGSize == 1) {
         ref.insert(insert_element);
       } else {
@@ -173,7 +171,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename cuda::std::iterator_traits<InputIt>::value_type const& erase_element{*(first + idx)};
+    typename cuda::std::iterator_traits<InputIt>::value_type const erase_element{*(first + idx)};
     if constexpr (CGSize == 1) {
       ref.erase(erase_element);
     } else {
@@ -213,7 +211,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename cuda::std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
+    typename cuda::std::iterator_traits<InputIt>::value_type const key{*(first + idx)};
     if constexpr (CGSize == 1) {
       ref.for_each(key, callback_op);
     } else {
@@ -276,7 +274,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
         /*
          * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
          * sector stores from L2 to global memory. By writing results to shared memory and then
@@ -290,7 +288,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize>(block);
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+        typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
         auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
         if (tile.thread_rank() == 0) { *(output_begin + idx) = found; }
       }
@@ -392,8 +390,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-        auto const found                                                    = ref.find(key);
+        typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
+        auto const found                                                   = ref.find(key);
         /*
          * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
          * sector stores from L2 to global memory. By writing results to shared memory and then
@@ -407,8 +405,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize>(block);
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-        auto const found                                                    = ref.find(tile, key);
+        typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
+        auto const found                                                   = ref.find(tile, key);
 
         if (tile.thread_rank() == 0) {
           *(output_begin + idx) = pred(*(stencil + idx)) ? output(found) : sentinel;
@@ -482,7 +480,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
   while ((idx - thread_idx / CGSize) < n) {  // the whole thread block falls into the same iteration
     if constexpr (CGSize == 1) {
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+        typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{
           *(first + idx)};
         auto const [iter, inserted] = ref.insert_and_find(insert_element);
         /*
@@ -502,7 +500,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
     } else {
       auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
       if (idx < n) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
+        typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{
           *(first + idx)};
         auto const [iter, inserted] = ref.insert_and_find(tile, insert_element);
         if (tile.thread_rank() == 0) {
@@ -553,7 +551,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
   while (idx < n) {
-    typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+    typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
     if constexpr (CGSize == 1) {
       if constexpr (IsOuter) {
         thread_count += max(ref.count(key), outer_min_count);
@@ -612,7 +610,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count_each(InputIt first,
   size_type constexpr outer_min_count = 1;
 
   while (idx < n) {
-    typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
+    typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
     if constexpr (CGSize == 1) {
       if constexpr (IsOuter) {
         *(output_begin + idx) = max(ref.count(key), size_type{outer_min_count});
diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
@@ -372,7 +372,7 @@ class open_addressing_ref_impl {
    * @return True if the given element is successfully inserted
    */
   template <typename Value>
-  __device__ bool insert(Value const& value) noexcept
+  __device__ bool insert(Value value) noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
@@ -427,7 +427,7 @@ class open_addressing_ref_impl {
    */
   template <bool SupportsErase, typename Value>
   __device__ bool insert(cooperative_groups::thread_block_tile<cg_size> const& group,
-                         Value const& value) noexcept
+                         Value value) noexcept
   {
     auto const val = this->heterogeneous_value(value);
     auto const key = this->extract_key(val);
@@ -512,7 +512,7 @@ class open_addressing_ref_impl {
    * insertion is successful or not.
    */
   template <typename Value>
-  __device__ cuda::std::pair<iterator, bool> insert_and_find(Value const& value) noexcept
+  __device__ cuda::std::pair<iterator, bool> insert_and_find(Value value) noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 #if __CUDA_ARCH__ < 700
@@ -587,7 +587,7 @@ class open_addressing_ref_impl {
    */
   template <typename Value>
   __device__ cuda::std::pair<iterator, bool> insert_and_find(
-    cooperative_groups::thread_block_tile<cg_size> const& group, Value const& value) noexcept
+    cooperative_groups::thread_block_tile<cg_size> const& group, Value value) noexcept
   {
 #if __CUDA_ARCH__ < 700
     // Spinning to ensure that the write to the value part took place requires
@@ -678,12 +678,12 @@ class open_addressing_ref_impl {
    *
    * @tparam ProbeKey Input type which is convertible to 'key_type'
    *
-   * @param value The element to erase
+   * @param key The element to erase
    *
    * @return True if the given element is successfully erased
    */
   template <typename ProbeKey>
-  __device__ bool erase(ProbeKey const& key) noexcept
+  __device__ bool erase(ProbeKey key) noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
@@ -723,13 +723,13 @@ class open_addressing_ref_impl {
    * @tparam ProbeKey Input type which is convertible to 'key_type'
    *
    * @param group The Cooperative Group used to perform group erase
-   * @param value The element to erase
+   * @param key The element to erase
    *
    * @return True if the given element is successfully erased
    */
   template <typename ProbeKey>
   __device__ bool erase(cooperative_groups::thread_block_tile<cg_size> const& group,
-                        ProbeKey const& key) noexcept
+                        ProbeKey key) noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -787,7 +787,7 @@ class open_addressing_ref_impl {
    * @return A boolean indicating whether the probe key is present
    */
   template <typename ProbeKey>
-  [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept
+  [[nodiscard]] __device__ bool contains(ProbeKey key) const noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
     auto probing_iter =
@@ -826,7 +826,7 @@ class open_addressing_ref_impl {
    */
   template <typename ProbeKey>
   [[nodiscard]] __device__ bool contains(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey key) const noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -866,7 +866,7 @@ class open_addressing_ref_impl {
    * @return An iterator to the position at which the equivalent key is stored
    */
   template <typename ProbeKey>
-  [[nodiscard]] __device__ iterator find(ProbeKey const& key) const noexcept
+  [[nodiscard]] __device__ iterator find(ProbeKey key) const noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
     auto probing_iter =
@@ -908,8 +908,8 @@ class open_addressing_ref_impl {
    * @return An iterator to the position at which the equivalent key is stored
    */
   template <typename ProbeKey>
-  [[nodiscard]] __device__ iterator find(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+  [[nodiscard]] __device__ iterator
+  find(cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey key) const noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -957,7 +957,7 @@ class open_addressing_ref_impl {
    * @return Number of occurrences found by the current thread
    */
   template <typename ProbeKey>
-  [[nodiscard]] __device__ size_type count(ProbeKey const& key) const noexcept
+  [[nodiscard]] __device__ size_type count(ProbeKey key) const noexcept
   {
     if constexpr (not allows_duplicates) {
       return static_cast<size_type>(this->contains(key));
@@ -1004,8 +1004,8 @@ class open_addressing_ref_impl {
    * @return Number of occurrences found by the current thread
    */
   template <typename ProbeKey>
-  [[nodiscard]] __device__ size_type count(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+  [[nodiscard]] __device__ size_type
+  count(cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey key) const noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -1360,7 +1360,7 @@ class open_addressing_ref_impl {
    * @param callback_op Function to apply to every matched slot
    */
   template <class ProbeKey, class CallbackOp>
-  __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept
+  __device__ void for_each(ProbeKey key, CallbackOp&& callback_op) const noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
     auto probing_iter =
@@ -1410,7 +1410,7 @@ class open_addressing_ref_impl {
    */
   template <class ProbeKey, class CallbackOp>
   __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
-                           ProbeKey const& key,
+                           ProbeKey key,
                            CallbackOp&& callback_op) const noexcept
   {
     auto probing_iter =
@@ -1474,7 +1474,7 @@ class open_addressing_ref_impl {
    */
   template <class ProbeKey, class CallbackOp, class SyncOp>
   __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
-                           ProbeKey const& key,
+                           ProbeKey key,
                            CallbackOp&& callback_op,
                            SyncOp&& sync_op) const noexcept
   {
@@ -1534,7 +1534,7 @@ class open_addressing_ref_impl {
    * @return The key
    */
   template <typename Value>
-  [[nodiscard]] __host__ __device__ constexpr auto extract_key(Value const& value) const noexcept
+  [[nodiscard]] __host__ __device__ constexpr auto extract_key(Value value) const noexcept
   {
     if constexpr (has_payload) {
       return thrust::raw_reference_cast(value).first;
@@ -1555,8 +1555,7 @@ class open_addressing_ref_impl {
    * @return The payload
    */
   template <typename Value, typename Enable = cuda::std::enable_if_t<has_payload and sizeof(Value)>>
-  [[nodiscard]] __host__ __device__ constexpr auto extract_payload(
-    Value const& value) const noexcept
+  [[nodiscard]] __host__ __device__ constexpr auto extract_payload(Value value) const noexcept
   {
     return thrust::raw_reference_cast(value).second;
   }
@@ -1571,7 +1570,7 @@ class open_addressing_ref_impl {
    * @return The converted object
    */
   template <typename T>
-  [[nodiscard]] __device__ constexpr value_type native_value(T const& value) const noexcept
+  [[nodiscard]] __device__ constexpr value_type native_value(T value) const noexcept
   {
     if constexpr (has_payload) {
       return {static_cast<key_type>(this->extract_key(value)), this->extract_payload(value)};
@@ -1591,7 +1590,7 @@ class open_addressing_ref_impl {
    * @return The converted object
    */
   template <typename T>
-  [[nodiscard]] __device__ constexpr auto heterogeneous_value(T const& value) const noexcept
+  [[nodiscard]] __device__ constexpr auto heterogeneous_value(T value) const noexcept
   {
     if constexpr (has_payload and not cuda::std::is_same_v<T, value_type>) {
       using mapped_type = decltype(this->empty_value_sentinel());
@@ -1613,7 +1612,7 @@ class open_addressing_ref_impl {
    *
    * @return The sentinel value used to represent an erased slot
    */
-  [[nodiscard]] __device__ constexpr value_type const erased_slot_sentinel() const noexcept
+  [[nodiscard]] __device__ constexpr value_type erased_slot_sentinel() const noexcept
   {
     if constexpr (has_payload) {
       return cuco::pair{this->erased_key_sentinel(), this->empty_value_sentinel()};
@@ -1674,8 +1673,8 @@ class open_addressing_ref_impl {
    */
   template <typename Value>
   [[nodiscard]] __device__ constexpr insert_result back_to_back_cas(value_type* address,
-                                                                    value_type const& expected,
-                                                                    Value const& desired) noexcept
+                                                                    value_type expected,
+                                                                    Value desired) noexcept
   {
     using mapped_type = cuda::std::decay_t<decltype(this->empty_value_sentinel())>;
 
@@ -1725,8 +1724,9 @@ class open_addressing_ref_impl {
    * @return Result of this operation, i.e., success/continue/duplicate
    */
   template <typename Value>
-  [[nodiscard]] __device__ constexpr insert_result cas_dependent_write(
-    value_type* address, value_type const& expected, Value const& desired) noexcept
+  [[nodiscard]] __device__ constexpr insert_result cas_dependent_write(value_type* address,
+                                                                       value_type expected,
+                                                                       Value desired) noexcept
   {
     using mapped_type = cuda::std::decay_t<decltype(this->empty_value_sentinel())>;
 
@@ -1767,8 +1767,8 @@ class open_addressing_ref_impl {
    */
   template <typename Value>
   [[nodiscard]] __device__ insert_result attempt_insert(value_type* address,
-                                                        value_type const& expected,
-                                                        Value const& desired) noexcept
+                                                        value_type expected,
+                                                        Value desired) noexcept
   {
     if constexpr (sizeof(value_type) <= 8) {
       return packed_cas(address, expected, desired);
@@ -1800,8 +1800,8 @@ class open_addressing_ref_impl {
    */
   template <typename Value>
   [[nodiscard]] __device__ insert_result attempt_insert_stable(value_type* address,
-                                                               value_type const& expected,
-                                                               Value const& desired) noexcept
+                                                               value_type expected,
+                                                               Value desired) noexcept
   {
     if constexpr (sizeof(value_type) <= 8) {
       return packed_cas(address, expected, desired);
@@ -1822,7 +1822,7 @@ class open_addressing_ref_impl {
    * @param sentinel The slot sentinel value
    */
   template <typename T>
-  __device__ void wait_for_payload(T& slot, T const& sentinel) const noexcept
+  __device__ void wait_for_payload(T& slot, T sentinel) const noexcept
   {
     auto ref = cuda::atomic_ref<T, Scope>{slot};
     T current;
diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl
diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh

Original file line number	Diff line number	Diff line change
`@@ -83,7 +83,7 @@ __host__ __device__ constexpr std::size_t alignment()`
`83`	`83`	`* @return If the bits in the object representations of lhs and rhs are identical.`
`84`	`84`	`*/`
`85`	`85`	`template <typename T>`
`86`		`-__host__ __device__ constexpr bool bitwise_compare(T const& lhs, T const& rhs)`
	`86`	`+__host__ __device__ constexpr bool bitwise_compare(T lhs, T rhs)`
`87`	`87`	`{`
`88`	`88`	`static_assert(`
`89`	`89`	`cuco::is_bitwise_comparable_v<T>,`
Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ struct slot_is_filled {`
`73`	`73`	`* @param empty_sentinel Key sentinel indicating an empty slot`
`74`	`74`	`* @param erased_sentinel Key sentinel indicating an erased slot`
`75`	`75`	`*/`
`76`		`- explicit constexpr slot_is_filled(T const& empty_sentinel, T const& erased_sentinel) noexcept`
	`76`	`+ explicit constexpr slot_is_filled(T empty_sentinel, T erased_sentinel) noexcept`
`77`	`77`	`: empty_sentinel_{empty_sentinel}, erased_sentinel_{erased_sentinel}`
`78`	`78`	`{`
`79`	`79`	`}`
`@@ -88,7 +88,7 @@ struct slot_is_filled {`
`88`	`88`	`` * @return `true` if slot is filled ``
`89`	`89`	`*/`
`90`	`90`	`template <typename S>`
`91`		`- __device__ constexpr bool operator()(S const& slot) const noexcept`
	`91`	`+ __device__ constexpr bool operator()(S slot) const noexcept`
`92`	`92`	`{`
`93`	`93`	`auto const key = [&]() {`
`94`	`94`	`if constexpr (HasPayload) {`