
Commit b5fbd0e

Merge pull request #14 from dumerrill/master

Compilation and bug fixes, improved histo performance

Former-commit-id: 5603fbf

2 parents: 8363afc + b2fd037

File tree: 198 files changed (+3832, -2948 lines)

Some content is hidden: large commits have some of their content hidden by default, so not all of the 198 changed files (+3832, -2948 lines) appear below.

CHANGE_LOG.TXT

Lines changed: 5 additions & 4 deletions

@@ -1,14 +1,15 @@
 //-----------------------------------------------------------------------------
 
-0.9.3 04/30/2013
+0.9.4 05/07/2013
 
+    - Fixed compilation errors for SM10-SM13
+    - Fixed compilation errors for some WarpScan entrypoints on SM30+
+    - Added block-wide histogram (BlockHisto256)
+    - Added device-wide histogram (DeviceHisto256)
     - Added new BlockScan algorithm variant BLOCK_SCAN_RAKING_MEMOIZE, which
       trades more register consumption for less shared memory I/O)
-    - Added block-wide histogram (BlockHisto256)
     - Updates to BlockRadixRank to use BlockScan (which improves performance
       on Kepler due to SHFL instruction)
-    - Added device-wide histogram (DeviceHisto256)
-    - Fixed compilation errors for some WarpScan entrypoints on SM30+
     - Allow types other than C++ primitives to be used in WarpScan::*Sum methods
       if they only have operator + overloaded. (Previously they also required
       to support assignment from int(0).)

cub/block/block_histo_256.cuh

Lines changed: 32 additions & 17 deletions

@@ -45,9 +45,12 @@ CUB_NS_PREFIX
 namespace cub {
 
 
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
 /**
- * BlockHisto256Algorithm enumerates alternative algorithms for the parallel
- * construction of 8b histograms.
+ * \brief BlockHisto256Algorithm enumerates alternative algorithms for the parallel construction of 8b histograms.
  */
 enum BlockHisto256Algorithm
 {
@@ -57,21 +60,33 @@ enum BlockHisto256Algorithm
      * Sorting followed by differentiation. Execution is comprised of two phases:
      * -# Sort the 8b data using efficient radix sort
      * -# Look for "runs" of same-valued 8b keys by detecting discontinuities; the run-lengths are histogram bin counts.
+     *
+     * \par Performance Considerations
+     * Delivers consistent throughput regardless of sample bin distribution.
      */
-    BLOCK_BYTE_HISTO_SORT,
+    BLOCK_HISTO_256_SORT,
 
 
     /**
      * \par Overview
     * Use atomic addition to update byte counts directly
     *
-     * \par Usage Considerations
-     * BLOCK_BYTE_HISTO_ATOMIC can only be used on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used regardless.
+     * \par Performance Considerations
+     * Performance is strongly tied to the hardware implementation of atomic
+     * addition, and may be significantly degraded for non uniformly-random
+     * input distributions where many concurrent updates are likely to be
+     * made to the same bin counter.
      */
-    BLOCK_BYTE_HISTO_ATOMIC,
+    BLOCK_HISTO_256_ATOMIC,
 };
 
 
+
+/******************************************************************************
+ * Block histogram
+ ******************************************************************************/
+
+
 /**
  * \addtogroup BlockModule
  * @{
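The BLOCK_HISTO_256_SORT variant above (sort, then detect runs of equal keys) is easiest to see outside of CUDA. The following is a minimal serial C++ sketch of the idea only; it is not CUB's implementation, which performs the sort with a block-wide radix sort and finds the runs with parallel discontinuity detection.

    // Sketch of the "sort, then differentiate" idea behind BLOCK_HISTO_256_SORT,
    // written as serial host code for clarity.
    #include <algorithm>
    #include <cstring>
    #include <vector>

    void Histo256SortSketch(std::vector<unsigned char> samples, unsigned int bins[256])
    {
        std::memset(bins, 0, 256 * sizeof(unsigned int));

        // Phase 1: sort the 8b samples so that equal values form contiguous runs
        std::sort(samples.begin(), samples.end());

        // Phase 2: detect discontinuities; each run-length is a bin count
        size_t run_start = 0;
        for (size_t i = 1; i <= samples.size(); ++i)
        {
            if ((i == samples.size()) || (samples[i] != samples[run_start]))
            {
                bins[samples[run_start]] += (unsigned int) (i - run_start);
                run_start = i;
            }
        }
    }

Because the work is proportional only to the number of samples, the throughput of this scheme does not depend on how the samples are distributed across bins, which is the property the added "Performance Considerations" note calls out.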
@@ -90,12 +105,12 @@ enum BlockHisto256Algorithm
  *
  * \tparam BLOCK_THREADS The threadblock size in threads
  * \tparam ITEMS_PER_THREAD The number of items per thread
- * \tparam ALGORITHM <b>[optional]</b> cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_BYTE_HISTO_SORT)
+ * \tparam ALGORITHM <b>[optional]</b> cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_HISTO_256_SORT)
  *
  * \par Algorithm
  * BlockHisto256 can be (optionally) configured to use different algorithms:
- * -# <b>cub::BLOCK_BYTE_HISTO_SORT</b>. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm)
- * -# <b>cub::BLOCK_BYTE_HISTO_ATOMIC</b>. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHisto256Algorithm)
+ * -# <b>cub::BLOCK_HISTO_256_SORT</b>. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm)
+ * -# <b>cub::BLOCK_HISTO_256_ATOMIC</b>. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHisto256Algorithm)
  *
  * \par Usage Considerations
  * - The histogram output can be constructed in shared or global memory
@@ -167,7 +182,7 @@ enum BlockHisto256Algorithm
 template <
     int BLOCK_THREADS,
     int ITEMS_PER_THREAD,
-    BlockHisto256Algorithm ALGORITHM = BLOCK_BYTE_HISTO_SORT>
+    BlockHisto256Algorithm ALGORITHM = BLOCK_HISTO_256_SORT>
 class BlockHisto256
 {
 private:
@@ -178,13 +193,13 @@ private:
 
     /**
      * Ensure the template parameterization meets the requirements of the
-     * targeted device architecture. BLOCK_BYTE_HISTO_ATOMIC can only be used
-     * on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used
+     * targeted device architecture. BLOCK_HISTO_256_ATOMIC can only be used
+     * on version SM120 or later. Otherwise BLOCK_HISTO_256_SORT is used
      * regardless.
      */
    static const BlockHisto256Algorithm SAFE_ALGORITHM =
-        ((ALGORITHM == BLOCK_BYTE_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
-            BLOCK_BYTE_HISTO_SORT :
+        ((ALGORITHM == BLOCK_HISTO_256_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
+            BLOCK_HISTO_256_SORT :
             ALGORITHM;
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
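The SAFE_ALGORITHM ternary above demotes an unsupported algorithm choice at compile time. A standalone sketch of the same pattern, with hypothetical names rather than CUB's, for readers who want to see it in isolation:

    // Standalone sketch of the compile-time fallback pattern (hypothetical names):
    // demote the requested algorithm when the target PTX version cannot support it.
    enum HistoAlgorithm { HISTO_SORT, HISTO_ATOMIC };

    template <HistoAlgorithm ALGORITHM, int PTX_ARCH>
    struct SafeAlgorithm
    {
        static const HistoAlgorithm VALUE =
            ((ALGORITHM == HISTO_ATOMIC) && (PTX_ARCH < 120)) ?
                HISTO_SORT :    // atomic variant unsupported below SM120: fall back
                ALGORITHM;      // otherwise honor the caller's choice
    };

    // SafeAlgorithm<HISTO_ATOMIC, 110>::VALUE == HISTO_SORT
    // SafeAlgorithm<HISTO_ATOMIC, 200>::VALUE == HISTO_ATOMIC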
@@ -195,7 +210,7 @@ private:
     ******************************************************************************/
 
     /**
-     * BLOCK_BYTE_HISTO_SORT algorithmic variant
+     * BLOCK_HISTO_256_SORT algorithmic variant
      */
     template <BlockHisto256Algorithm _ALGORITHM, int DUMMY = 0>
     struct BlockHisto256Internal
@@ -319,10 +334,10 @@ private:
 
 
     /**
-     * BLOCK_BYTE_HISTO_ATOMIC algorithmic variant
+     * BLOCK_HISTO_256_ATOMIC algorithmic variant
      */
     template <int DUMMY>
-    struct BlockHisto256Internal<BLOCK_BYTE_HISTO_ATOMIC, DUMMY>
+    struct BlockHisto256Internal<BLOCK_HISTO_256_ATOMIC, DUMMY>
     {
         /// Shared memory storage layout type
         struct SmemStorage {};
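The empty SmemStorage above reflects that the atomic variant needs no private shared-memory layout: each thread adds its samples directly into the caller's bin counters. A minimal CUDA sketch of that idea (an illustration, not CUB's code), assuming bins points to 256 counters, e.g. in shared memory, that the block has already zeroed and synchronized on:

    // Sketch of the BLOCK_HISTO_256_ATOMIC idea (not CUB's implementation):
    // every thread atomically bumps the bin counter for each of its samples.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void AtomicHisto256Sketch(
        unsigned char (&items)[ITEMS_PER_THREAD],   // per-thread 8b samples
        unsigned int *bins)                         // 256 zero-initialized bin counters
    {
        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            atomicAdd(bins + items[i], 1u);         // contended when many samples hit one bin
    }

The contention noted in the enum documentation is visible here: when many threads' samples map to the same bin, the atomicAdd calls serialize on that counter.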

cub/block/block_load.cuh

Lines changed: 31 additions & 31 deletions

@@ -77,8 +77,8 @@ template <
     int ITEMS_PER_THREAD,
     typename InputIteratorRA>
 __device__ __forceinline__ void BlockLoadDirect(
-    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
-    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
     // Load directly in thread-blocked order
     #pragma unroll
@@ -106,8 +106,8 @@ template <
     int ITEMS_PER_THREAD,
     typename InputIteratorRA>
 __device__ __forceinline__ void BlockLoadDirect(
-    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
-    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
     BlockLoadDirect<PTX_LOAD_NONE>(block_itr, items);
 }
@@ -132,9 +132,9 @@ template <
     int ITEMS_PER_THREAD,
     typename InputIteratorRA>
 __device__ __forceinline__ void BlockLoadDirect(
-    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
-    const int &guarded_items, ///< [in] Number of valid items in the tile
-    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+    const int &guarded_items, ///< [in] Number of valid items in the tile
+    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
     int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD);
 
@@ -165,9 +165,9 @@ template <
     int ITEMS_PER_THREAD,
     typename InputIteratorRA>
 __device__ __forceinline__ void BlockLoadDirect(
-    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
-    const int &guarded_items, ///< [in] Number of valid items in the tile
-    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+    const int &guarded_items, ///< [in] Number of valid items in the tile
+    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
     BlockLoadDirect<PTX_LOAD_NONE>(block_itr, guarded_items, items);
 }
@@ -191,10 +191,10 @@ template <
     int ITEMS_PER_THREAD,
     typename InputIteratorRA>
 __device__ __forceinline__ void BlockLoadDirect(
-    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
-    const int &guarded_items, ///< [in] Number of valid items in the tile
-    T oob_default, ///< [in] Default value to assign out-of-bound items
-    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+    const int &guarded_items, ///< [in] Number of valid items in the tile
+    T oob_default, ///< [in] Default value to assign out-of-bound items
+    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
     int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD);
 
@@ -224,10 +224,10 @@ template <
     int ITEMS_PER_THREAD,
     typename InputIteratorRA>
 __device__ __forceinline__ void BlockLoadDirect(
-    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
-    const int &guarded_items, ///< [in] Number of valid items in the tile
-    T oob_default, ///< [in] Default value to assign out-of-bound items
-    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+    const int &guarded_items, ///< [in] Number of valid items in the tile
+    T oob_default, ///< [in] Default value to assign out-of-bound items
+    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
     BlockLoadDirect<PTX_LOAD_NONE>(block_itr, guarded_items, oob_default, items);
 }
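The guarded overloads above make it possible to load the last, partially full tile safely. The following kernel fragment is illustrative only (the kernel and variable names are hypothetical, not from CUB's examples): each thread receives ITEMS_PER_THREAD items, with out-of-range slots filled by the oob_default argument.

    // Hypothetical kernel fragment showing a guarded, direct (thread-blocked) load.
    template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
    __global__ void LoadTileKernel(int *d_in, int num_items)
    {
        int items[ITEMS_PER_THREAD];

        int block_offset  = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
        int guarded_items = num_items - block_offset;   // valid items remaining in this tile

        // Out-of-bound slots receive the oob_default value (0 here)
        cub::BlockLoadDirect(d_in + block_offset, guarded_items, 0, items);

        // ... operate on items ...
    }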
@@ -348,10 +348,10 @@ template <
     int ITEMS_PER_THREAD,
     typename InputIteratorRA>
 __device__ __forceinline__ void BlockLoadDirectStriped(
-    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
-    const int &guarded_items, ///< [in] Number of valid items in the tile
-    T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
+    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+    const int &guarded_items, ///< [in] Number of valid items in the tile
+    T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
 {
     BlockLoadDirectStriped<PTX_LOAD_NONE>(block_itr, guarded_items, items, stride);
 }
@@ -409,11 +409,11 @@ template <
     int ITEMS_PER_THREAD,
     typename InputIteratorRA>
 __device__ __forceinline__ void BlockLoadDirectStriped(
-    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
-    const int &guarded_items, ///< [in] Number of valid items in the tile
-    T oob_default, ///< [in] Default value to assign out-of-bound items
-    T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
+    InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+    const int &guarded_items, ///< [in] Number of valid items in the tile
+    T oob_default, ///< [in] Default value to assign out-of-bound items
+    T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
 {
     BlockLoadDirectStriped<PTX_LOAD_NONE>(block_itr, guarded_items, oob_default, items, stride);
 }
@@ -446,8 +446,8 @@ template <
     typename T,
     int ITEMS_PER_THREAD>
 __device__ __forceinline__ void BlockLoadVectorized(
-    T *block_ptr, ///< [in] Input pointer for loading from
-    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    T *block_ptr, ///< [in] Input pointer for loading from
+    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
     enum
     {
@@ -503,8 +503,8 @@ template <
     typename T,
     int ITEMS_PER_THREAD>
 __device__ __forceinline__ void BlockLoadVectorized(
-    T *block_ptr, ///< [in] Input pointer for loading from
-    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    T *block_ptr, ///< [in] Input pointer for loading from
+    T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
     BlockLoadVectorized<PTX_LOAD_NONE>(block_ptr, items);
 }

cub/block/block_reduce.cuh

Lines changed: 28 additions & 9 deletions

@@ -50,6 +50,11 @@ CUB_NS_PREFIX
 namespace cub {
 
 
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
 /**
  * BlockReduceAlgorithm enumerates alternative algorithms for parallel
  * reduction across a CUDA threadblock.
@@ -59,9 +64,13 @@ enum BlockReduceAlgorithm
 
     /**
      * \par Overview
-     * An efficient "raking" reduction algorithm. Execution is comprised of three phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
-     * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions.
+     * An efficient "raking" reduction algorithm. Execution is comprised of
+     * three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each). Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory. Threads within a
+     *    single warp rake across segments of shared partial reductions.
      * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
      *
      * \par
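The three phases described above can be sketched in isolation. The fragment below is a simplified illustration of a raking reduction, not the BLOCK_REDUCE_RAKING code path itself; it assumes BLOCK_THREADS is a multiple of 32, that temp holds BLOCK_THREADS ints of shared memory, and it uses the current-toolkit __shfl_down_sync intrinsic for the final warp step (the 2013-era equivalents were __shfl_down on Kepler or warp-synchronous shared memory on earlier hardware).

    // Simplified sketch of a raking block reduction (illustration only).
    template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
    __device__ int BlockReduceRakingSketch(
        int (&items)[ITEMS_PER_THREAD],     // per-thread inputs
        int *temp)                          // BLOCK_THREADS ints of shared memory
    {
        const int RAKING_THREADS = 32;
        const int SEGMENT        = BLOCK_THREADS / RAKING_THREADS;

        // Phase 1: upsweep sequential reduction in registers, one partial per thread
        int partial = items[0];
        #pragma unroll
        for (int i = 1; i < ITEMS_PER_THREAD; ++i)
            partial += items[i];
        temp[threadIdx.x] = partial;
        __syncthreads();

        // Phase 2: a single warp rakes across segments of the shared partials
        if (threadIdx.x < RAKING_THREADS)
        {
            int raked = 0;
            #pragma unroll
            for (int i = 0; i < SEGMENT; ++i)
                raked += temp[(threadIdx.x * SEGMENT) + i];

            // Phase 3: warp-synchronous Kogge-Stone style reduction in the raking warp
            #pragma unroll
            for (int offset = RAKING_THREADS / 2; offset > 0; offset /= 2)
                raked += __shfl_down_sync(0xffffffffu, raked, offset);

            partial = raked;
        }
        return partial;                     // block aggregate is valid in thread 0 only
    }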
@@ -78,24 +87,34 @@ enum BlockReduceAlgorithm
 
     /**
      * \par Overview
-     * A quick "tiled warp-reductions" reduction algorithm. Execution is comprised of four phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
-     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style reduction within each warp.
-     * -# A propagation phase where the warp reduction outputs in each warp are updated with the aggregate from each preceding warp.
+     * A quick "tiled warp-reductions" reduction algorithm. Execution is
+     * comprised of four phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each). Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
+     *    reduction within each warp.
+     * -# A propagation phase where the warp reduction outputs in each warp are
+     *    updated with the aggregate from each preceding warp.
      *
      * \par
     * \image html block_scan_warpscans.png
     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - Although this variant may suffer lower overall throughput across the
-     *   GPU because due to a heavy reliance on inefficient warp-reductions, it can
-     *   often provide lower turnaround latencies when the GPU is under-occupied.
+     *   GPU because due to a heavy reliance on inefficient warp-reductions, it
+     *   can often provide lower turnaround latencies when the GPU is
+     *   under-occupied.
      */
     BLOCK_REDUCE_WARP_REDUCTIONS,
 };
 
 
+/******************************************************************************
+ * Block reduce
+ ******************************************************************************/
+
 /**
  * \addtogroup BlockModule
  * @{
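For comparison, here is the same style of sketch for the tiled warp-reductions idea (again illustrative, not the BLOCK_REDUCE_WARP_REDUCTIONS implementation): phase 1 is identical to the raking sketch above, so the fragment takes the per-thread register partial as an input, and the final combination step is simplified to a single pass by thread 0, whereas the documentation above describes a propagation across preceding warps.

    // Simplified sketch of the tiled warp-reductions idea (illustration only).
    // partial is the per-thread register reduction from phase 1;
    // warp_aggregate holds BLOCK_THREADS/32 ints of shared memory.
    template <int BLOCK_THREADS>
    __device__ int BlockReduceWarpReductionsSketch(int partial, int *warp_aggregate)
    {
        const int WARP_THREADS = 32;
        const int WARPS        = BLOCK_THREADS / WARP_THREADS;
        int warp_id = threadIdx.x / WARP_THREADS;
        int lane    = threadIdx.x % WARP_THREADS;

        // Phase 2: shallow Kogge-Stone style reduction within each warp
        #pragma unroll
        for (int offset = WARP_THREADS / 2; offset > 0; offset /= 2)
            partial += __shfl_down_sync(0xffffffffu, partial, offset);

        // Publish each warp's aggregate, then combine (the propagation phase,
        // simplified here to a single pass by thread 0)
        if (lane == 0)
            warp_aggregate[warp_id] = partial;
        __syncthreads();

        if (threadIdx.x == 0)
        {
            partial = warp_aggregate[0];
            for (int w = 1; w < WARPS; ++w)
                partial += warp_aggregate[w];
        }
        return partial;                     // block aggregate is valid in thread 0 only
    }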
(file name not shown)

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-bf9ef662dca1d2d176543536669676825e26ede3
+e8017594efd372d9a44cf27dfa2fa3c1e7d404f4
