@@ -490,15 +490,16 @@ public:
490490 }
491491
492492 /* *
493- * @brief Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
493+ * @brief Subtracts the left element of each adjacent pair of elements
494+ * partitioned across a CUDA thread block.
494495 *
495496 * @par
496497 * - \rowmajor
497498 * - \smemreuse
498499 *
499500 * @par Snippet
500- * The code snippet below illustrates how to use @p BlockAdjacentDifference to
501- * compute the left difference between adjacent elements.
501+ * The code snippet below illustrates how to use @p BlockAdjacentDifference
502+ * to compute the left difference between adjacent elements.
502503 *
503504 * @par
504505 * @code
@@ -516,30 +517,152 @@ public:
516517 *
517518 * __global__ void ExampleKernel(...)
518519 * {
519- * // Specialize BlockAdjacentDifference for a 1D block of
520- * // 128 threads of type int
521- * using BlockAdjacentDifferenceT =
522- * cub::BlockAdjacentDifference<int, 128>;
520+ * // Specialize BlockAdjacentDifference for a 1D block of
521+ * // 128 threads of type int
522+ * using BlockAdjacentDifferenceT =
523+ * cub::BlockAdjacentDifference<int, 128>;
524+ *
525+ * // Allocate shared memory for BlockAdjacentDifference
526+ * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
527+ *
528+ * // Obtain a segment of consecutive items that are blocked across threads
529+ * int thread_data[4];
530+ * ...
531+ * int valid_items = 9;
532+ *
533+ * // Collectively compute adjacent_difference
534+ * BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
535+ * thread_data,
536+ * thread_data,
537+ * CustomDifference(),
538+ * valid_items);
523539 *
524- * // Allocate shared memory for BlockDiscontinuity
525- * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
540+ * @endcode
541+ * @par
542+ * Suppose the set of input `thread_data` across the block of threads is
543+ * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
544+ * The corresponding output in `thread_data` (with `valid_items` of 9) in those threads will be
545+ * `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
526546 *
527- * // Obtain a segment of consecutive items that are blocked across threads
528- * int thread_data[4];
529- * ...
547+ * @param[out] output
548+ * Calling thread's adjacent difference result
530549 *
531- * // Collectively compute adjacent_difference
532- * BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
533- * thread_data,
534- * thread_data,
535- * CustomDifference());
550+ * @param[in] input
551+ * Calling thread's input items (may be aliased to \p output)
552+ *
553+ * @param[in] difference_op
554+ * Binary difference operator
555+ *
556+ * @param[in] valid_items
557+ * Number of valid items in thread block (items past this count are copied from \p input unchanged)
558+ */
559+ template <int ITEMS_PER_THREAD,
560+ typename OutputType,
561+ typename DifferenceOpT>
562+ __device__ __forceinline__ void
563+ SubtractLeftPartialTile (T (&input)[ITEMS_PER_THREAD],
564+ OutputType (&output)[ITEMS_PER_THREAD],
565+ DifferenceOpT difference_op,
566+ int valid_items)
567+ { // Left difference over a partially-full tile: item i counts as valid when its tile index (linear_tid * ITEMS_PER_THREAD + i) < valid_items
568+ // Publish this thread's last input item; the thread with linear_tid + 1 reads it below for its own output[0]
569+ temp_storage.last_items [linear_tid] = input[ITEMS_PER_THREAD - 1 ];
570+
571+ CTA_SYNC (); // separates the last_items write above from the neighbour read below (presumably a block-wide barrier -- confirm against its definition)
572+
573+ if ((linear_tid + 1 ) * ITEMS_PER_THREAD <= valid_items) // fast path: every item of this thread is within valid_items, no per-item check needed
574+ {
575+ #pragma unroll
576+ for (int item = ITEMS_PER_THREAD - 1 ; item > 0 ; item--) // descending order: input[item - 1] is read before output[item - 1] is written, which keeps output aliased to input correct
577+ {
578+ output[item] = difference_op (input[item], input[item - 1 ]);
579+ }
580+ }
581+ else // this thread's items straddle or lie entirely past valid_items
582+ {
583+ #pragma unroll
584+ for (int item = ITEMS_PER_THREAD - 1 ; item > 0 ; item--) // same descending order as the fast path
585+ {
586+ const int idx = linear_tid * ITEMS_PER_THREAD + item; // position of this item within the whole tile
587+
588+ if (idx < valid_items)
589+ {
590+ output[item] = difference_op (input[item], input[item - 1 ]);
591+ }
592+ else
593+ {
594+ output[item] = input[item]; // out-of-range item: copied through, difference_op is not applied
595+ }
596+ }
597+ }
598+
599+ if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD) // thread 0 has no predecessor in this overload; otherwise this thread's first item is already out of range
600+ {
601+ output[0 ] = input[0 ]; // first item is copied unchanged in both of those cases
602+ }
603+ else
604+ {
605+ output[0 ] = difference_op (input[0 ], // left operand: this thread's first item
606+ temp_storage.last_items [linear_tid - 1 ]); // right operand: last item published by the previous thread
607+ }
608+ }
609+
610+ /* *
611+ * @brief Subtracts the left element of each adjacent pair of elements
612+ * partitioned across a CUDA thread block.
613+ *
614+ * @par
615+ * - \rowmajor
616+ * - \smemreuse
617+ *
618+ * @par Snippet
619+ * The code snippet below illustrates how to use @p BlockAdjacentDifference
620+ * to compute the left difference between adjacent elements.
621+ *
622+ * @par
623+ * @code
624+ * #include <cub/cub.cuh>
625+ * // or equivalently <cub/block/block_adjacent_difference.cuh>
626+ *
627+ * struct CustomDifference
628+ * {
629+ * template <typename DataType>
630+ * __device__ DataType operator()(DataType &lhs, DataType &rhs)
631+ * {
632+ * return lhs - rhs;
633+ * }
634+ * };
635+ *
636+ * __global__ void ExampleKernel(...)
637+ * {
638+ * // Specialize BlockAdjacentDifference for a 1D block of
639+ * // 128 threads of type int
640+ * using BlockAdjacentDifferenceT =
641+ * cub::BlockAdjacentDifference<int, 128>;
642+ *
643+ * // Allocate shared memory for BlockAdjacentDifference
644+ * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
645+ *
646+ * // Obtain a segment of consecutive items that are blocked across threads
647+ * int thread_data[4];
648+ * ...
649+ * int valid_items = 9;
650+ * int tile_predecessor_item = 4;
651+ *
652+ * // Collectively compute adjacent_difference
653+ * BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
654+ * thread_data,
655+ * thread_data,
656+ * CustomDifference(),
657+ * valid_items,
658+ * tile_predecessor_item);
536659 *
537660 * @endcode
538661 * @par
539662 * Suppose the set of input `thread_data` across the block of threads is
540663 * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
541664 * The corresponding output in `thread_data` (with `valid_items` of 9 and `tile_predecessor_item` of 4) in those threads will be
542- * `{ [4 ,-2,-1,0], [0,0,0,0], [1,1,0,0 ], [0,1,-3,3 ], ... }`.
665+ * `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
543666 *
544667 * @param[out] output
545668 * Calling thread's adjacent difference result
@@ -552,6 +675,11 @@ public:
552675 *
553676 * @param[in] valid_items
554677 * Number of valid items in thread block
678+ *
679+ * @param[in] tile_predecessor_item
680+ * **[<em>thread</em><sub>0</sub> only]** item which is going to be
681+ * subtracted from the first tile item (<tt>input<sub>0</sub></tt> from
682+ * <em>thread</em><sub>0</sub>).
555683 */
556684 template <int ITEMS_PER_THREAD,
557685 typename OutputType,
@@ -560,7 +688,8 @@ public:
560688 SubtractLeftPartialTile (T (&input)[ITEMS_PER_THREAD],
561689 OutputType (&output)[ITEMS_PER_THREAD],
562690 DifferenceOpT difference_op,
563- int valid_items)
691+ int valid_items,
692+ T tile_predecessor_item)
564693 {
565694 // Share last item
566695 temp_storage.last_items [linear_tid] = input[ITEMS_PER_THREAD - 1 ];
@@ -593,10 +722,15 @@ public:
593722 }
594723 }
595724
596- if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
725+ if (valid_items <= linear_tid * ITEMS_PER_THREAD)
597726 {
598727 output[0 ] = input[0 ];
599728 }
729+ else if (linear_tid == 0 )
730+ {
731+ output[0 ] = difference_op (input[0 ],
732+ tile_predecessor_item);
733+ }
600734 else
601735 {
602736 output[0 ] = difference_op (input[0 ],
0 commit comments