Skip to content

Commit 0cb59a6

Browse files
committed
Pass trivial types by value
1 parent d091dd6 commit 0cb59a6

File tree

6 files changed

+59
-61
lines changed

6 files changed

+59
-61
lines changed

include/cuco/detail/bitwise_compare.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ __host__ __device__ constexpr std::size_t alignment()
8383
* @return If the bits in the object representations of lhs and rhs are identical.
8484
*/
8585
template <typename T>
86-
__host__ __device__ constexpr bool bitwise_compare(T const& lhs, T const& rhs)
86+
__host__ __device__ constexpr bool bitwise_compare(T lhs, T rhs)
8787
{
8888
static_assert(
8989
cuco::is_bitwise_comparable_v<T>,

include/cuco/detail/open_addressing/functors.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ struct slot_is_filled {
7373
* @param empty_sentinel Key sentinel indicating an empty slot
7474
* @param erased_sentinel Key sentinel indicating an erased slot
7575
*/
76-
explicit constexpr slot_is_filled(T const& empty_sentinel, T const& erased_sentinel) noexcept
76+
explicit constexpr slot_is_filled(T empty_sentinel, T erased_sentinel) noexcept
7777
: empty_sentinel_{empty_sentinel}, erased_sentinel_{erased_sentinel}
7878
{
7979
}
@@ -88,7 +88,7 @@ struct slot_is_filled {
8888
* @return `true` if slot is filled
8989
*/
9090
template <typename S>
91-
__device__ constexpr bool operator()(S const& slot) const noexcept
91+
__device__ constexpr bool operator()(S slot) const noexcept
9292
{
9393
auto const key = [&]() {
9494
if constexpr (HasPayload) {

include/cuco/detail/open_addressing/kernels.cuh

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(InputIt first,
7878

7979
while (idx < n) {
8080
if (pred(*(stencil + idx))) {
81-
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
82-
*(first + idx)};
81+
typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{*(first + idx)};
8382
if constexpr (CGSize == 1) {
8483
if (ref.insert(insert_element)) { thread_num_successes++; };
8584
} else {
@@ -137,8 +136,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(
137136

138137
while (idx < n) {
139138
if (pred(*(stencil + idx))) {
140-
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
141-
*(first + idx)};
139+
typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{*(first + idx)};
142140
if constexpr (CGSize == 1) {
143141
ref.insert(insert_element);
144142
} else {
@@ -173,7 +171,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
173171
auto idx = cuco::detail::global_thread_id() / CGSize;
174172

175173
while (idx < n) {
176-
typename cuda::std::iterator_traits<InputIt>::value_type const& erase_element{*(first + idx)};
174+
typename cuda::std::iterator_traits<InputIt>::value_type const erase_element{*(first + idx)};
177175
if constexpr (CGSize == 1) {
178176
ref.erase(erase_element);
179177
} else {
@@ -213,7 +211,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
213211
auto idx = cuco::detail::global_thread_id() / CGSize;
214212

215213
while (idx < n) {
216-
typename cuda::std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
214+
typename cuda::std::iterator_traits<InputIt>::value_type const key{*(first + idx)};
217215
if constexpr (CGSize == 1) {
218216
ref.for_each(key, callback_op);
219217
} else {
@@ -276,7 +274,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
276274
while ((idx - thread_idx / CGSize) < n) { // the whole thread block falls into the same iteration
277275
if constexpr (CGSize == 1) {
278276
if (idx < n) {
279-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
277+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
280278
/*
281279
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
282280
* sector stores from L2 to global memory. By writing results to shared memory and then
@@ -290,7 +288,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
290288
} else {
291289
auto const tile = cg::tiled_partition<CGSize>(block);
292290
if (idx < n) {
293-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
291+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
294292
auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
295293
if (tile.thread_rank() == 0) { *(output_begin + idx) = found; }
296294
}
@@ -392,8 +390,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
392390
while ((idx - thread_idx / CGSize) < n) { // the whole thread block falls into the same iteration
393391
if constexpr (CGSize == 1) {
394392
if (idx < n) {
395-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
396-
auto const found = ref.find(key);
393+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
394+
auto const found = ref.find(key);
397395
/*
398396
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
399397
* sector stores from L2 to global memory. By writing results to shared memory and then
@@ -407,8 +405,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
407405
} else {
408406
auto const tile = cg::tiled_partition<CGSize>(block);
409407
if (idx < n) {
410-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
411-
auto const found = ref.find(tile, key);
408+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
409+
auto const found = ref.find(tile, key);
412410

413411
if (tile.thread_rank() == 0) {
414412
*(output_begin + idx) = pred(*(stencil + idx)) ? output(found) : sentinel;
@@ -482,7 +480,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
482480
while ((idx - thread_idx / CGSize) < n) { // the whole thread block falls into the same iteration
483481
if constexpr (CGSize == 1) {
484482
if (idx < n) {
485-
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
483+
typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{
486484
*(first + idx)};
487485
auto const [iter, inserted] = ref.insert_and_find(insert_element);
488486
/*
@@ -502,7 +500,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
502500
} else {
503501
auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
504502
if (idx < n) {
505-
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
503+
typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{
506504
*(first + idx)};
507505
auto const [iter, inserted] = ref.insert_and_find(tile, insert_element);
508506
if (tile.thread_rank() == 0) {
@@ -553,7 +551,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
553551
auto idx = cuco::detail::global_thread_id() / CGSize;
554552

555553
while (idx < n) {
556-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
554+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
557555
if constexpr (CGSize == 1) {
558556
if constexpr (IsOuter) {
559557
thread_count += max(ref.count(key), outer_min_count);
@@ -612,7 +610,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count_each(InputIt first,
612610
size_type constexpr outer_min_count = 1;
613611

614612
while (idx < n) {
615-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
613+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
616614
if constexpr (CGSize == 1) {
617615
if constexpr (IsOuter) {
618616
*(output_begin + idx) = max(ref.count(key), size_type{outer_min_count});

include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh

Lines changed: 34 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ class open_addressing_ref_impl {
372372
* @return True if the given element is successfully inserted
373373
*/
374374
template <typename Value>
375-
__device__ bool insert(Value const& value) noexcept
375+
__device__ bool insert(Value value) noexcept
376376
{
377377
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
378378

@@ -427,7 +427,7 @@ class open_addressing_ref_impl {
427427
*/
428428
template <bool SupportsErase, typename Value>
429429
__device__ bool insert(cooperative_groups::thread_block_tile<cg_size> const& group,
430-
Value const& value) noexcept
430+
Value value) noexcept
431431
{
432432
auto const val = this->heterogeneous_value(value);
433433
auto const key = this->extract_key(val);
@@ -512,7 +512,7 @@ class open_addressing_ref_impl {
512512
* insertion is successful or not.
513513
*/
514514
template <typename Value>
515-
__device__ cuda::std::pair<iterator, bool> insert_and_find(Value const& value) noexcept
515+
__device__ cuda::std::pair<iterator, bool> insert_and_find(Value value) noexcept
516516
{
517517
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
518518
#if __CUDA_ARCH__ < 700
@@ -587,7 +587,7 @@ class open_addressing_ref_impl {
587587
*/
588588
template <typename Value>
589589
__device__ cuda::std::pair<iterator, bool> insert_and_find(
590-
cooperative_groups::thread_block_tile<cg_size> const& group, Value const& value) noexcept
590+
cooperative_groups::thread_block_tile<cg_size> const& group, Value value) noexcept
591591
{
592592
#if __CUDA_ARCH__ < 700
593593
// Spinning to ensure that the write to the value part took place requires
@@ -678,12 +678,12 @@ class open_addressing_ref_impl {
678678
*
679679
* @tparam ProbeKey Input type which is convertible to 'key_type'
680680
*
681-
* @param value The element to erase
681+
* @param key The element to erase
682682
*
683683
* @return True if the given element is successfully erased
684684
*/
685685
template <typename ProbeKey>
686-
__device__ bool erase(ProbeKey const& key) noexcept
686+
__device__ bool erase(ProbeKey key) noexcept
687687
{
688688
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
689689

@@ -723,13 +723,13 @@ class open_addressing_ref_impl {
723723
* @tparam ProbeKey Input type which is convertible to 'key_type'
724724
*
725725
* @param group The Cooperative Group used to perform group erase
726-
* @param value The element to erase
726+
* @param key The element to erase
727727
*
728728
* @return True if the given element is successfully erased
729729
*/
730730
template <typename ProbeKey>
731731
__device__ bool erase(cooperative_groups::thread_block_tile<cg_size> const& group,
732-
ProbeKey const& key) noexcept
732+
ProbeKey key) noexcept
733733
{
734734
auto probing_iter =
735735
probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -787,7 +787,7 @@ class open_addressing_ref_impl {
787787
* @return A boolean indicating whether the probe key is present
788788
*/
789789
template <typename ProbeKey>
790-
[[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept
790+
[[nodiscard]] __device__ bool contains(ProbeKey key) const noexcept
791791
{
792792
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
793793
auto probing_iter =
@@ -826,7 +826,7 @@ class open_addressing_ref_impl {
826826
*/
827827
template <typename ProbeKey>
828828
[[nodiscard]] __device__ bool contains(
829-
cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
829+
cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey key) const noexcept
830830
{
831831
auto probing_iter =
832832
probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -866,7 +866,7 @@ class open_addressing_ref_impl {
866866
* @return An iterator to the position at which the equivalent key is stored
867867
*/
868868
template <typename ProbeKey>
869-
[[nodiscard]] __device__ iterator find(ProbeKey const& key) const noexcept
869+
[[nodiscard]] __device__ iterator find(ProbeKey key) const noexcept
870870
{
871871
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
872872
auto probing_iter =
@@ -908,8 +908,8 @@ class open_addressing_ref_impl {
908908
* @return An iterator to the position at which the equivalent key is stored
909909
*/
910910
template <typename ProbeKey>
911-
[[nodiscard]] __device__ iterator find(
912-
cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
911+
[[nodiscard]] __device__ iterator
912+
find(cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey key) const noexcept
913913
{
914914
auto probing_iter =
915915
probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -957,7 +957,7 @@ class open_addressing_ref_impl {
957957
* @return Number of occurrences found by the current thread
958958
*/
959959
template <typename ProbeKey>
960-
[[nodiscard]] __device__ size_type count(ProbeKey const& key) const noexcept
960+
[[nodiscard]] __device__ size_type count(ProbeKey key) const noexcept
961961
{
962962
if constexpr (not allows_duplicates) {
963963
return static_cast<size_type>(this->contains(key));
@@ -1004,8 +1004,8 @@ class open_addressing_ref_impl {
10041004
* @return Number of occurrences found by the current thread
10051005
*/
10061006
template <typename ProbeKey>
1007-
[[nodiscard]] __device__ size_type count(
1008-
cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
1007+
[[nodiscard]] __device__ size_type
1008+
count(cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey key) const noexcept
10091009
{
10101010
auto probing_iter =
10111011
probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -1360,7 +1360,7 @@ class open_addressing_ref_impl {
13601360
* @param callback_op Function to apply to every matched slot
13611361
*/
13621362
template <class ProbeKey, class CallbackOp>
1363-
__device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept
1363+
__device__ void for_each(ProbeKey key, CallbackOp&& callback_op) const noexcept
13641364
{
13651365
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
13661366
auto probing_iter =
@@ -1410,7 +1410,7 @@ class open_addressing_ref_impl {
14101410
*/
14111411
template <class ProbeKey, class CallbackOp>
14121412
__device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
1413-
ProbeKey const& key,
1413+
ProbeKey key,
14141414
CallbackOp&& callback_op) const noexcept
14151415
{
14161416
auto probing_iter =
@@ -1474,7 +1474,7 @@ class open_addressing_ref_impl {
14741474
*/
14751475
template <class ProbeKey, class CallbackOp, class SyncOp>
14761476
__device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
1477-
ProbeKey const& key,
1477+
ProbeKey key,
14781478
CallbackOp&& callback_op,
14791479
SyncOp&& sync_op) const noexcept
14801480
{
@@ -1534,7 +1534,7 @@ class open_addressing_ref_impl {
15341534
* @return The key
15351535
*/
15361536
template <typename Value>
1537-
[[nodiscard]] __host__ __device__ constexpr auto extract_key(Value const& value) const noexcept
1537+
[[nodiscard]] __host__ __device__ constexpr auto extract_key(Value value) const noexcept
15381538
{
15391539
if constexpr (has_payload) {
15401540
return thrust::raw_reference_cast(value).first;
@@ -1555,8 +1555,7 @@ class open_addressing_ref_impl {
15551555
* @return The payload
15561556
*/
15571557
template <typename Value, typename Enable = cuda::std::enable_if_t<has_payload and sizeof(Value)>>
1558-
[[nodiscard]] __host__ __device__ constexpr auto extract_payload(
1559-
Value const& value) const noexcept
1558+
[[nodiscard]] __host__ __device__ constexpr auto extract_payload(Value value) const noexcept
15601559
{
15611560
return thrust::raw_reference_cast(value).second;
15621561
}
@@ -1571,7 +1570,7 @@ class open_addressing_ref_impl {
15711570
* @return The converted object
15721571
*/
15731572
template <typename T>
1574-
[[nodiscard]] __device__ constexpr value_type native_value(T const& value) const noexcept
1573+
[[nodiscard]] __device__ constexpr value_type native_value(T value) const noexcept
15751574
{
15761575
if constexpr (has_payload) {
15771576
return {static_cast<key_type>(this->extract_key(value)), this->extract_payload(value)};
@@ -1591,7 +1590,7 @@ class open_addressing_ref_impl {
15911590
* @return The converted object
15921591
*/
15931592
template <typename T>
1594-
[[nodiscard]] __device__ constexpr auto heterogeneous_value(T const& value) const noexcept
1593+
[[nodiscard]] __device__ constexpr auto heterogeneous_value(T value) const noexcept
15951594
{
15961595
if constexpr (has_payload and not cuda::std::is_same_v<T, value_type>) {
15971596
using mapped_type = decltype(this->empty_value_sentinel());
@@ -1613,7 +1612,7 @@ class open_addressing_ref_impl {
16131612
*
16141613
* @return The sentinel value used to represent an erased slot
16151614
*/
1616-
[[nodiscard]] __device__ constexpr value_type const erased_slot_sentinel() const noexcept
1615+
[[nodiscard]] __device__ constexpr value_type erased_slot_sentinel() const noexcept
16171616
{
16181617
if constexpr (has_payload) {
16191618
return cuco::pair{this->erased_key_sentinel(), this->empty_value_sentinel()};
@@ -1674,8 +1673,8 @@ class open_addressing_ref_impl {
16741673
*/
16751674
template <typename Value>
16761675
[[nodiscard]] __device__ constexpr insert_result back_to_back_cas(value_type* address,
1677-
value_type const& expected,
1678-
Value const& desired) noexcept
1676+
value_type expected,
1677+
Value desired) noexcept
16791678
{
16801679
using mapped_type = cuda::std::decay_t<decltype(this->empty_value_sentinel())>;
16811680

@@ -1725,8 +1724,9 @@ class open_addressing_ref_impl {
17251724
* @return Result of this operation, i.e., success/continue/duplicate
17261725
*/
17271726
template <typename Value>
1728-
[[nodiscard]] __device__ constexpr insert_result cas_dependent_write(
1729-
value_type* address, value_type const& expected, Value const& desired) noexcept
1727+
[[nodiscard]] __device__ constexpr insert_result cas_dependent_write(value_type* address,
1728+
value_type expected,
1729+
Value desired) noexcept
17301730
{
17311731
using mapped_type = cuda::std::decay_t<decltype(this->empty_value_sentinel())>;
17321732

@@ -1767,8 +1767,8 @@ class open_addressing_ref_impl {
17671767
*/
17681768
template <typename Value>
17691769
[[nodiscard]] __device__ insert_result attempt_insert(value_type* address,
1770-
value_type const& expected,
1771-
Value const& desired) noexcept
1770+
value_type expected,
1771+
Value desired) noexcept
17721772
{
17731773
if constexpr (sizeof(value_type) <= 8) {
17741774
return packed_cas(address, expected, desired);
@@ -1800,8 +1800,8 @@ class open_addressing_ref_impl {
18001800
*/
18011801
template <typename Value>
18021802
[[nodiscard]] __device__ insert_result attempt_insert_stable(value_type* address,
1803-
value_type const& expected,
1804-
Value const& desired) noexcept
1803+
value_type expected,
1804+
Value desired) noexcept
18051805
{
18061806
if constexpr (sizeof(value_type) <= 8) {
18071807
return packed_cas(address, expected, desired);
@@ -1822,7 +1822,7 @@ class open_addressing_ref_impl {
18221822
* @param sentinel The slot sentinel value
18231823
*/
18241824
template <typename T>
1825-
__device__ void wait_for_payload(T& slot, T const& sentinel) const noexcept
1825+
__device__ void wait_for_payload(T& slot, T sentinel) const noexcept
18261826
{
18271827
auto ref = cuda::atomic_ref<T, Scope>{slot};
18281828
T current;

0 commit comments

Comments
 (0)