
Commit 6e2a232

- Fix for ambiguity in BlockReduce::Reduce() between generic reduction and
  summation. Summation entrypoints are now called ::Sum(), similar to the
  convention in BlockScan.
- Small edits to mainpage documentation and download tracking
- Refactor test and docs outside of cub directory. Take generated html docs
  off gitignore.

Former-commit-id: 98c1a6b
1 parent dbf8dae commit 6e2a232

File tree

320 files changed: +19524 −480 lines changed


VERSION.TXT

Lines changed: 17 additions & 1 deletion
@@ -1 +1,17 @@
-CUB version 0.900
+
+0.9.1    03/09/2013
+
+    - Fix for ambiguity in BlockReduce::Reduce() between generic reduction and
+      summation. Summation entrypoints are now called ::Sum(), similar
+      to the convention in BlockScan.
+
+    - Small edits to mainpage documentation and download tracking
+
+//-----------------------------------------------------------------------------
+
+0.9.0    03/07/2013
+
+    - Initial "preview" release. CUB is the first durable, high-performance library
+      of cooperative block-level, warp-level, and thread-level primitives for CUDA
+      kernel programming. More primitives and examples coming soon!
+
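
The rename amounts to a one-line change at call sites. A minimal sketch of the 0.9.1 spelling, assuming the BlockReduce<T, BLOCK_THREADS> typedef and SmemStorage convention shown in the block_reduce.cuh documentation below (the header path, kernel, and buffer names are illustrative, not part of the commit):

    #include <cub/cub.cuh>   // umbrella header path assumed; adjust to the local layout

    // Hypothetical kernel: 128 threads each contribute one int; thread0
    // receives the block-wide total via the renamed Sum() entry point.
    __global__ void SumKernel(const int *d_in, int *d_block_sums)
    {
        typedef cub::BlockReduce<int, 128> BlockReduce;
        __shared__ typename BlockReduce::SmemStorage smem_storage;

        int data = d_in[blockIdx.x * 128 + threadIdx.x];

        // 0.9.0 spelling (removed by this commit):
        //     int aggregate = BlockReduce::Reduce(smem_storage, data);
        // 0.9.1 spelling:
        int aggregate = BlockReduce::Sum(smem_storage, data);

        if (threadIdx.x == 0)
            d_block_sums[blockIdx.x] = aggregate;
    }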

cub/block/block_reduce.cuh

Lines changed: 58 additions & 58 deletions
@@ -112,7 +112,7 @@ namespace cub {
  *     ...
  *
  *     // Compute the threadblock-wide sum for thread0
- *     int aggregate = BlockReduce::Reduce(smem_storage, data);
+ *     int aggregate = BlockReduce::Sum(smem_storage, data);
  *
  *     ...
  * \endcode
@@ -137,7 +137,7 @@ namespace cub {
  *     if (threadIdx.x < num_elements) data = ...;
  *
  *     // Compute the threadblock-wide sum of valid elements in thread0
- *     int aggregate = BlockReduce::Reduce(smem_storage, data, num_elements);
+ *     int aggregate = BlockReduce::Sum(smem_storage, data, num_elements);
  *
  *     ...
  * \endcode
@@ -296,63 +296,7 @@ private:
 
 public:
 
-    /******************************************************************//**
-     * \name Summation reductions
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes one input element.
-     *
-     * The return value is undefined in threads other than thread<sub>0</sub>.
-     *
-     * \smemreuse
-     */
-    static __device__ __forceinline__ T Reduce(
-        SmemStorage     &smem_storage,      ///< [in] Shared reference to opaque SmemStorage layout
-        T               input)              ///< [in] Calling thread's input
-    {
-        Sum<T> reduction_op;
-        return Reduce(smem_storage, input, reduction_op);
-    }
-
-    /**
-     * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
-     *
-     * The return value is undefined in threads other than thread<sub>0</sub>.
-     *
-     * \smemreuse
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    static __device__ __forceinline__ T Reduce(
-        SmemStorage     &smem_storage,                  ///< [in] Shared reference to opaque SmemStorage layout
-        T               (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
-    {
-        Sum<T> reduction_op;
-        return Reduce(smem_storage, inputs, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. The first \p valid_threads threads each contribute one input element.
-     *
-     * \smemreuse
-     *
-     * The return value is undefined in threads other than thread<sub>0</sub>.
-     */
-    static __device__ __forceinline__ T Reduce(
-        SmemStorage     &smem_storage,      ///< [in] Shared reference to opaque SmemStorage layout
-        T               input,              ///< [in] Calling thread's input
-        const unsigned int &valid_threads)  ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
-    {
-        Sum<T> reduction_op;
-        return Reduce(smem_storage, input, valid_threads);
-    }
-
 
-    //@}
     /******************************************************************//**
      * \name Generic reductions
      *********************************************************************/
@@ -430,7 +374,63 @@ public:
     }
 
     //@}
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes one input element.
+     *
+     * The return value is undefined in threads other than thread<sub>0</sub>.
+     *
+     * \smemreuse
+     */
+    static __device__ __forceinline__ T Sum(
+        SmemStorage     &smem_storage,      ///< [in] Shared reference to opaque SmemStorage layout
+        T               input)              ///< [in] Calling thread's input
+    {
+        cub::Sum<T> reduction_op;
+        return Reduce(smem_storage, input, reduction_op);
+    }
+
+    /**
+     * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
+     *
+     * The return value is undefined in threads other than thread<sub>0</sub>.
+     *
+     * \smemreuse
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    static __device__ __forceinline__ T Sum(
+        SmemStorage     &smem_storage,                  ///< [in] Shared reference to opaque SmemStorage layout
+        T               (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
+    {
+        cub::Sum<T> reduction_op;
+        return Reduce(smem_storage, inputs, reduction_op);
+    }
+
 
+    /**
+     * \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. The first \p valid_threads threads each contribute one input element.
+     *
+     * \smemreuse
+     *
+     * The return value is undefined in threads other than thread<sub>0</sub>.
+     */
+    static __device__ __forceinline__ T Sum(
+        SmemStorage     &smem_storage,      ///< [in] Shared reference to opaque SmemStorage layout
+        T               input,              ///< [in] Calling thread's input
+        const unsigned int &valid_threads)  ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        cub::Sum<T> reduction_op;
+        return Reduce(smem_storage, input, reduction_op, valid_threads);
+    }
+
+
+    //@}
 };
 
 /** @} */        // end of SimtCoop group
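
The hunks above move summation onto its own ::Sum() name while generic reductions keep ::Reduce() with an explicit operator. A hedged sketch of how the two entry points read side by side after this commit (the functor, kernel, and buffer names are illustrative; the typedef, SmemStorage, and \smemreuse synchronization follow the documentation in this file):

    #include <cub/cub.cuh>   // umbrella header path assumed; adjust to the local layout

    // Illustrative binary max functor for the generic Reduce() entry point.
    struct MaxOp
    {
        __device__ __forceinline__ int operator()(int a, int b) const
        {
            return (a > b) ? a : b;
        }
    };

    // Hypothetical kernel: with the rename, an integer third argument can no
    // longer be confused with a reduction functor.
    template <int BLOCK_THREADS>
    __global__ void SumAndMaxKernel(const int *d_in, int *d_sums, int *d_maxes, int num_valid)
    {
        typedef cub::BlockReduce<int, BLOCK_THREADS> BlockReduce;
        __shared__ typename BlockReduce::SmemStorage smem_storage;

        int item = d_in[blockIdx.x * BLOCK_THREADS + threadIdx.x];

        // Summation over the first num_valid threads via the dedicated Sum() name.
        int block_sum = BlockReduce::Sum(smem_storage, item, num_valid);

        __syncthreads();    // allow smem_storage to be reused (\smemreuse)

        // Generic reduction still takes an explicit reduction operator.
        int block_max = BlockReduce::Reduce(smem_storage, item, MaxOp());

        if (threadIdx.x == 0)
        {
            d_sums[blockIdx.x]  = block_sum;
            d_maxes[blockIdx.x] = block_max;
        }
    }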
