You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
{{ message }}
This repository was archived by the owner on Mar 21, 2024. It is now read-only.
- Fix for ambiguity in BlockScan::Reduce() between generic reduction and
summation. Summation entrypoints are now called ::Sum(), similar
to the convention in BlockScan.
- Small edits to mainpage documentation and download tracking
- Refactor test and docs outside of cub directory. Take generated html
docs off gitignore.
Former-commit-id: 98c1a6b
* \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes one input element.
306
-
*
307
-
* The return value is undefined in threads other than thread<sub>0</sub>.
308
-
*
309
-
* \smemreuse
310
-
*/
311
-
static __device__ __forceinline__ T Reduce(
312
-
SmemStorage &smem_storage, ///< [in] Shared reference to opaque SmemStorage layout
313
-
T input) ///< [in] Calling thread's input
314
-
{
315
-
Sum<T> reduction_op;
316
-
return Reduce(smem_storage, input, reduction_op);
317
-
}
318
-
319
-
/**
320
-
* \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
321
-
*
322
-
* The return value is undefined in threads other than thread<sub>0</sub>.
323
-
*
324
-
* \smemreuse
325
-
*
326
-
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
327
-
*/
328
-
template <int ITEMS_PER_THREAD>
329
-
static __device__ __forceinline__ T Reduce(
330
-
SmemStorage &smem_storage, ///< [in] Shared reference to opaque SmemStorage layout
331
-
T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment
332
-
{
333
-
Sum<T> reduction_op;
334
-
return Reduce(smem_storage, inputs, reduction_op);
335
-
}
336
-
337
-
338
-
/**
339
-
* \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. The first \p valid_threads threads each contribute one input element.
340
-
*
341
-
* \smemreuse
342
-
*
343
-
* The return value is undefined in threads other than thread<sub>0</sub>.
344
-
*/
345
-
static __device__ __forceinline__ T Reduce(
346
-
SmemStorage &smem_storage, ///< [in] Shared reference to opaque SmemStorage layout
347
-
T input, ///< [in] Calling thread's input
348
-
const unsigned int &valid_threads) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
* \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes one input element.
384
+
*
385
+
* The return value is undefined in threads other than thread<sub>0</sub>.
386
+
*
387
+
* \smemreuse
388
+
*/
389
+
static __device__ __forceinline__ T Sum(
390
+
SmemStorage &smem_storage, ///< [in] Shared reference to opaque SmemStorage layout
391
+
T input) ///< [in] Calling thread's input
392
+
{
393
+
cub::Sum<T> reduction_op;
394
+
return Reduce(smem_storage, input, reduction_op);
395
+
}
396
+
397
+
/**
398
+
* \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
399
+
*
400
+
* The return value is undefined in threads other than thread<sub>0</sub>.
401
+
*
402
+
* \smemreuse
403
+
*
404
+
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
405
+
*/
406
+
template <int ITEMS_PER_THREAD>
407
+
static __device__ __forceinline__ T Sum(
408
+
SmemStorage &smem_storage, ///< [in] Shared reference to opaque SmemStorage layout
409
+
T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment
410
+
{
411
+
cub::Sum<T> reduction_op;
412
+
return Reduce(smem_storage, inputs, reduction_op);
413
+
}
414
+
433
415
416
+
/**
417
+
* \brief Computes a threadblock-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. The first \p valid_threads threads each contribute one input element.
418
+
*
419
+
* \smemreuse
420
+
*
421
+
* The return value is undefined in threads other than thread<sub>0</sub>.
422
+
*/
423
+
static __device__ __forceinline__ T Sum(
424
+
SmemStorage &smem_storage, ///< [in] Shared reference to opaque SmemStorage layout
425
+
T input, ///< [in] Calling thread's input
426
+
const unsigned int &valid_threads) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
0 commit comments