Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit ec2e841

Browse files
authored
Merge pull request #529 from senior-zero/fix-main/github/adj_diff_copy_bp
Backport adjacent difference fixes
2 parents c64885b + 6dda1c2 commit ec2e841

File tree

5 files changed

+459
-441
lines changed

5 files changed

+459
-441
lines changed

cub/agent/agent_adjacent_difference.cuh

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -143,16 +143,40 @@ struct AgentDifference
143143
{
144144
if (IS_FIRST_TILE)
145145
{
146-
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
147-
.SubtractLeft(input, output, difference_op);
146+
if (IS_LAST_TILE)
147+
{
148+
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
149+
.SubtractLeftPartialTile(input,
150+
output,
151+
difference_op,
152+
num_remaining);
153+
}
154+
else
155+
{
156+
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
157+
.SubtractLeft(input, output, difference_op);
158+
}
148159
}
149160
else
150161
{
151-
InputT tile_prev_input = MayAlias ? first_tile_previous[tile_idx]
152-
: *(input_it + tile_base - 1);
153-
154-
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
155-
.SubtractLeft(input, output, difference_op, tile_prev_input);
162+
InputT tile_prev_input = MayAlias
163+
? first_tile_previous[tile_idx]
164+
: *(input_it + tile_base - 1);
165+
166+
if (IS_LAST_TILE)
167+
{
168+
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
169+
.SubtractLeftPartialTile(input,
170+
output,
171+
difference_op,
172+
num_remaining,
173+
tile_prev_input);
174+
}
175+
else
176+
{
177+
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
178+
.SubtractLeft(input, output, difference_op, tile_prev_input);
179+
}
156180
}
157181
}
158182
else

cub/block/block_adjacent_difference.cuh

Lines changed: 154 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -490,15 +490,16 @@ public:
490490
}
491491

492492
/**
493-
* @brief Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
493+
* @brief Subtracts the left element of each adjacent pair of elements
494+
* partitioned across a CUDA thread block.
494495
*
495496
* @par
496497
* - \rowmajor
497498
* - \smemreuse
498499
*
499500
* @par Snippet
500-
* The code snippet below illustrates how to use @p BlockAdjacentDifference to
501-
* compute the left difference between adjacent elements.
501+
* The code snippet below illustrates how to use @p BlockAdjacentDifference
502+
* to compute the left difference between adjacent elements.
502503
*
503504
* @par
504505
* @code
@@ -516,30 +517,152 @@ public:
516517
*
517518
* __global__ void ExampleKernel(...)
518519
* {
519-
* // Specialize BlockAdjacentDifference for a 1D block of
520-
* // 128 threads of type int
521-
* using BlockAdjacentDifferenceT =
522-
* cub::BlockAdjacentDifference<int, 128>;
520+
* // Specialize BlockAdjacentDifference for a 1D block of
521+
* // 128 threads of type int
522+
* using BlockAdjacentDifferenceT =
523+
* cub::BlockAdjacentDifference<int, 128>;
524+
*
525+
* // Allocate shared memory for BlockDiscontinuity
526+
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
527+
*
528+
* // Obtain a segment of consecutive items that are blocked across threads
529+
* int thread_data[4];
530+
* ...
531+
* int valid_items = 9;
532+
*
533+
* // Collectively compute adjacent_difference
534+
* BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
535+
* thread_data,
536+
* thread_data,
537+
* CustomDifference(),
538+
* valid_items);
523539
*
524-
* // Allocate shared memory for BlockDiscontinuity
525-
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
540+
* @endcode
541+
* @par
542+
* Suppose the set of input `thread_data` across the block of threads is
543+
* `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
544+
* The corresponding output `result` in those threads will be
545+
* `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
526546
*
527-
* // Obtain a segment of consecutive items that are blocked across threads
528-
* int thread_data[4];
529-
* ...
547+
* @param[out] output
548+
* Calling thread's adjacent difference result
530549
*
531-
* // Collectively compute adjacent_difference
532-
* BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
533-
* thread_data,
534-
* thread_data,
535-
* CustomDifference());
550+
* @param[in] input
551+
* Calling thread's input items (may be aliased to \p output)
552+
*
553+
* @param[in] difference_op
554+
* Binary difference operator
555+
*
556+
* @param[in] valid_items
557+
* Number of valid items in thread block
558+
*/
559+
template <int ITEMS_PER_THREAD,
560+
typename OutputType,
561+
typename DifferenceOpT>
562+
__device__ __forceinline__ void
563+
SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
564+
OutputType (&output)[ITEMS_PER_THREAD],
565+
DifferenceOpT difference_op,
566+
int valid_items)
567+
{
568+
// Share last item
569+
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
570+
571+
CTA_SYNC();
572+
573+
if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items)
574+
{
575+
#pragma unroll
576+
for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
577+
{
578+
output[item] = difference_op(input[item], input[item - 1]);
579+
}
580+
}
581+
else
582+
{
583+
#pragma unroll
584+
for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
585+
{
586+
const int idx = linear_tid * ITEMS_PER_THREAD + item;
587+
588+
if (idx < valid_items)
589+
{
590+
output[item] = difference_op(input[item], input[item - 1]);
591+
}
592+
else
593+
{
594+
output[item] = input[item];
595+
}
596+
}
597+
}
598+
599+
if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
600+
{
601+
output[0] = input[0];
602+
}
603+
else
604+
{
605+
output[0] = difference_op(input[0],
606+
temp_storage.last_items[linear_tid - 1]);
607+
}
608+
}
609+
610+
/**
611+
* @brief Subtracts the left element of each adjacent pair of elements
612+
* partitioned across a CUDA thread block.
613+
*
614+
* @par
615+
* - \rowmajor
616+
* - \smemreuse
617+
*
618+
* @par Snippet
619+
* The code snippet below illustrates how to use @p BlockAdjacentDifference
620+
* to compute the left difference between adjacent elements.
621+
*
622+
* @par
623+
* @code
624+
* #include <cub/cub.cuh>
625+
* // or equivalently <cub/block/block_adjacent_difference.cuh>
626+
*
627+
* struct CustomDifference
628+
* {
629+
* template <typename DataType>
630+
* __device__ DataType operator()(DataType &lhs, DataType &rhs)
631+
* {
632+
* return lhs - rhs;
633+
* }
634+
* };
635+
*
636+
* __global__ void ExampleKernel(...)
637+
* {
638+
* // Specialize BlockAdjacentDifference for a 1D block of
639+
* // 128 threads of type int
640+
* using BlockAdjacentDifferenceT =
641+
* cub::BlockAdjacentDifference<int, 128>;
642+
*
643+
* // Allocate shared memory for BlockDiscontinuity
644+
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
645+
*
646+
* // Obtain a segment of consecutive items that are blocked across threads
647+
* int thread_data[4];
648+
* ...
649+
* int valid_items = 9;
650+
* int tile_predecessor_item = 4;
651+
*
652+
* // Collectively compute adjacent_difference
653+
* BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
654+
* thread_data,
655+
* thread_data,
656+
* CustomDifference(),
657+
* valid_items,
658+
* tile_predecessor_item);
536659
*
537660
* @endcode
538661
* @par
539662
* Suppose the set of input `thread_data` across the block of threads is
540663
* `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
541664
* The corresponding output `result` in those threads will be
542-
* `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
665+
* `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
543666
*
544667
* @param[out] output
545668
* Calling thread's adjacent difference result
@@ -552,6 +675,11 @@ public:
552675
*
553676
* @param[in] valid_items
554677
* Number of valid items in thread block
678+
*
679+
* @param[in] tile_predecessor_item
680+
* **[<em>thread</em><sub>0</sub> only]** item which is going to be
681+
* subtracted from the first tile item (<tt>input<sub>0</sub></tt> from
682+
* <em>thread</em><sub>0</sub>).
555683
*/
556684
template <int ITEMS_PER_THREAD,
557685
typename OutputType,
@@ -560,7 +688,8 @@ public:
560688
SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
561689
OutputType (&output)[ITEMS_PER_THREAD],
562690
DifferenceOpT difference_op,
563-
int valid_items)
691+
int valid_items,
692+
T tile_predecessor_item)
564693
{
565694
// Share last item
566695
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
@@ -593,10 +722,15 @@ public:
593722
}
594723
}
595724

596-
if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
725+
if (valid_items <= linear_tid * ITEMS_PER_THREAD)
597726
{
598727
output[0] = input[0];
599728
}
729+
else if (linear_tid == 0)
730+
{
731+
output[0] = difference_op(input[0],
732+
tile_predecessor_item);
733+
}
600734
else
601735
{
602736
output[0] = difference_op(input[0],

cub/device/dispatch/dispatch_adjacent_difference.cuh

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -199,30 +199,27 @@ struct DispatchAdjacentDifference : public SelectedPolicy
199199
sizeof(InputT);
200200

201201
void *allocations[1] = {nullptr};
202-
std::size_t allocation_sizes[1] = {first_tile_previous_size};
202+
std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size};
203203

204-
if (MayAlias)
204+
if (CubDebug(error = AliasTemporaries(d_temp_storage,
205+
temp_storage_bytes,
206+
allocations,
207+
allocation_sizes)))
205208
{
206-
if (CubDebug(error = AliasTemporaries(d_temp_storage,
207-
temp_storage_bytes,
208-
allocations,
209-
allocation_sizes)))
210-
{
211-
break;
212-
}
213-
214-
if (d_temp_storage == nullptr)
215-
{
216-
// Return if the caller is simply requesting the size of the storage
217-
// allocation
209+
break;
210+
}
218211

219-
if (temp_storage_bytes == 0)
220-
{
221-
temp_storage_bytes = 1;
222-
}
212+
if (d_temp_storage == nullptr)
213+
{
214+
// Return if the caller is simply requesting the size of the storage
215+
// allocation
223216

224-
break;
217+
if (temp_storage_bytes == 0)
218+
{
219+
temp_storage_bytes = 1;
225220
}
221+
222+
break;
226223
}
227224

228225
if (num_items == OffsetT{})

0 commit comments

Comments
 (0)