NVIDIA
diff --git a/‎CHANGE_LOG.TXT‎
Lines changed: 33 additions & 15 deletions b/‎CHANGE_LOG.TXT‎
Lines changed: 33 additions & 15 deletions
diff --git a/‎cub/block/block_histo_256.cuh‎
Lines changed: 2 additions & 2 deletions b/‎cub/block/block_histo_256.cuh‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cub/device/device_histo_256.cuh‎
Lines changed: 2 additions & 2 deletions b/‎cub/device/device_histo_256.cuh‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cub/device/device_reduce.cuh‎
Lines changed: 2 additions & 3 deletions b/‎cub/device/device_reduce.cuh‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎docs/Doxyfile‎
Lines changed: 2 additions & 0 deletions b/‎docs/Doxyfile‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/html/annotated.html‎
Lines changed: 4 additions & 4 deletions b/‎docs/html/annotated.html‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/html/block__discontinuity_8cuh.html‎
Lines changed: 1 addition & 1 deletion b/‎docs/html/block__discontinuity_8cuh.html‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/html/block__exchange_8cuh.html‎
Lines changed: 1 addition & 1 deletion b/‎docs/html/block__exchange_8cuh.html‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/html/block__histo__256_8cuh.html‎
Lines changed: 3 additions & 3 deletions b/‎docs/html/block__histo__256_8cuh.html‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎docs/html/block__load_8cuh.html‎
Lines changed: 1 addition & 1 deletion b/‎docs/html/block__load_8cuh.html‎
Lines changed: 1 addition & 1 deletion
@@ -1,29 +1,47 @@
 //-----------------------------------------------------------------------------
 
+0.9.3	04/30/2013
+
+    - Added new BlockScan algorithm variant BLOCK_SCAN_RAKING_MEMOIZE (which 
+      trades more register consumption for less shared memory I/O)
+    - Added block-wide histogram (BlockHisto256)
+    - Updates to BlockRadixRank to use BlockScan (which improves performance
+      on Kepler due to SHFL instruction)
+    - Added device-wide histogram (DeviceHisto256)
+    - Fixed compilation errors for some WarpScan entrypoints on SM30+
+    - Allow types other than C++ primitives to be used in WarpScan::*Sum methods 
+      if they only have operator + overloaded.  (Previously they were also 
+      required to support assignment from int(0).) 
+    - Update BlockReduce's BLOCK_REDUCE_WARP_REDUCTIONS algorithm to work even 
+      when block size is not an even multiple of warp size
+    - Added work management utility descriptors (GridQueue, GridEvenShare)
+    - Refactoring of DeviceAllocator interface and CachingDeviceAllocator 
+      implementation 
+    - Misc. documentation updates and corrections. 
+     
+//-----------------------------------------------------------------------------
+
 0.9.2	04/04/2013
 
-		- Added WarpReduce.  WarpReduce uses the SHFL instruction when applicable.
-		  BlockReduce now uses this WarpReduce instead of implementing its own.
-		    
-		- Misc. fixes for 64-bit Linux compilation warnings and errors.
-		
-		- Misc. documentation updates and corrections. 
+    - Added WarpReduce.  WarpReduce uses the SHFL instruction when applicable. 
+      BlockReduce now uses this WarpReduce instead of implementing its own.
+    - Misc. fixes for 64-bit Linux compilation warnings and errors.
+    - Misc. documentation updates and corrections. 
 
 //-----------------------------------------------------------------------------
 
 0.9.1	03/09/2013
 
-		- Fix for ambiguity in BlockScan::Reduce() between generic reduction and 
-		  summation.  Summation entrypoints are now called ::Sum(), similar
-		  to the convention in BlockScan.
-		    
-		- Small edits to mainpage documentation and download tracking
-		
+    - Fix for ambiguity in BlockScan::Reduce() between generic reduction and 
+      summation.  Summation entrypoints are now called ::Sum(), similar to the 
+      convention in BlockScan.
+    - Small edits to mainpage documentation and download tracking
+    
 //-----------------------------------------------------------------------------
 
 0.9.0	03/07/2013	
 
-		- Intial "preview" release.	CUB is the first durable, high-performance library 
-		  of cooperative block-level, warp-level, and thread-level primitives for CUDA 
-		  kernel programming.  More primitives and examples coming soon!
+    - Initial "preview" release.  CUB is the first durable, high-performance library 
+      of cooperative block-level, warp-level, and thread-level primitives for CUDA 
+      kernel programming.  More primitives and examples coming soon!
 
@@ -28,7 +28,7 @@
 
 /**
  * \file
- * cub::BlockHisto256 provides methods for constructing (and compositing into) 256-valued histograms from 8b data partitioned across threads within a CUDA thread block.
+ * cub::BlockHisto256 provides methods for constructing (and compositing into) 256-bin histograms from 8b data partitioned across threads within a CUDA thread block.
  */
 
 #pragma once
@@ -78,7 +78,7 @@ enum BlockHisto256Algorithm
  */
 
 /**
- * \brief BlockHisto256 provides methods for constructing (and compositing into) 256-valued histograms from 8b data partitioned across threads within a CUDA thread block. ![](histogram_logo.png)
+ * \brief BlockHisto256 provides methods for constructing (and compositing into) 256-bin histograms from 8b data partitioned across threads within a CUDA thread block. ![](histogram_logo.png)
  *
  * \par Overview
  * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
 
@@ -29,7 +29,7 @@
 
 /**
  * \file
- * cub::DeviceHisto256 provides variants of device-wide parallel histogram over data residing within global memory.
+ * cub::DeviceHisto256 provides device-wide parallel operations for constructing 256-bin histogram(s) over data samples residing within global memory.
  */
 
 #pragma once
@@ -169,7 +169,7 @@ __global__ void FinalizeHisto256Kernel(
  */
 
 /**
- * \brief DeviceHisto256 provides variants of device-wide parallel histogram over data residing within global memory. ![](histogram_logo.png)
+ * \brief DeviceHisto256 provides device-wide parallel operations for constructing 256-bin histogram(s) over data samples residing within global memory. ![](histogram_logo.png)
  */
 struct DeviceHisto256
 {
 
@@ -29,8 +29,7 @@
 
 /**
  * \file
- * cub::DeviceReduce provides variants of parallel reduction data residing
- * within a CUDA device's global memory.
+ * cub::DeviceReduce provides device-wide parallel operations for reducing data items residing within a CUDA device's global memory.
  */
 
 #pragma once
@@ -158,7 +157,7 @@ __global__ void SingleBlockReduceKernel(
  */
 
 /**
- * \brief DeviceReduce provides variants of parallel reduction data residing within a CUDA device's global memory. ![](reduce_logo.png)
+ * \brief DeviceReduce provides device-wide parallel operations for reducing data items residing within a CUDA device's global memory. ![](reduce_logo.png)
  */
 struct DeviceReduce
 {
 
@@ -956,12 +956,14 @@ HTML_EXTRA_STYLESHEET  = extra_stylesheet.css
 # the files will be copied as-is; there are no commands or markers available.
 
 HTML_EXTRA_FILES       = download_cub.html 
+HTML_EXTRA_FILES       += images/nvresearch.png 
 HTML_EXTRA_FILES       += images/download-icon.png 
 HTML_EXTRA_FILES       += images/groups-icon.png 
 HTML_EXTRA_FILES       += images/github-icon-747d8b799a48162434b2c0595ba1317e.png 
 HTML_EXTRA_FILES       += images/favicon.ico 
 HTML_EXTRA_FILES       += images/favicon.png 
 HTML_EXTRA_FILES       += images/tab_b_alt.png 
+HTML_EXTRA_FILES       += images/generic_abstraction.png 
 HTML_EXTRA_FILES       += images/simt_abstraction.png 
 HTML_EXTRA_FILES       += images/kernel_abstraction.png 
 HTML_EXTRA_FILES       += images/devfun_abstraction.png
 
@@ -113,7 +113,7 @@
 <img src="transpose_logo.png" alt="transpose_logo.png"/>
 </div>
 </td></tr>
-<tr id="row_0_6_"><td class="entry"><img src="ftv2blank.png" alt="&#160;" width="16" height="22" /><img src="ftv2node.png" alt="o" width="16" height="22" /><img src="ftv2cl.png" alt="C" width="24" height="22" /><a class="el" href="classcub_1_1_block_histo256.html" target="_self">BlockHisto256</a></td><td class="desc"><a class="el" href="classcub_1_1_block_histo256.html" title="BlockHisto256 provides methods for constructing (and compositing into) 256-valued histograms from 8b ...">BlockHisto256</a> provides methods for constructing (and compositing into) 256-valued histograms from 8b data partitioned across threads within a CUDA thread block. </p>
+<tr id="row_0_6_"><td class="entry"><img src="ftv2blank.png" alt="&#160;" width="16" height="22" /><img src="ftv2node.png" alt="o" width="16" height="22" /><img src="ftv2cl.png" alt="C" width="24" height="22" /><a class="el" href="classcub_1_1_block_histo256.html" target="_self">BlockHisto256</a></td><td class="desc"><a class="el" href="classcub_1_1_block_histo256.html" title="BlockHisto256 provides methods for constructing (and compositing into) 256-bin histograms from 8b dat...">BlockHisto256</a> provides methods for constructing (and compositing into) 256-bin histograms from 8b data partitioned across threads within a CUDA thread block. </p>
 <div class="image">
 <img src="histogram_logo.png" alt="histogram_logo.png"/>
 </div>
@@ -152,12 +152,12 @@
 <tr id="row_0_14_"><td class="entry"><img src="ftv2blank.png" alt="&#160;" width="16" height="22" /><img src="ftv2node.png" alt="o" width="16" height="22" /><img src="ftv2cl.png" alt="C" width="24" height="22" /><a class="el" href="structcub_1_1_cast.html" target="_self">Cast</a></td><td class="desc">Default cast functor</td></tr>
 <tr id="row_0_15_" class="even"><td class="entry"><img src="ftv2blank.png" alt="&#160;" width="16" height="22" /><img src="ftv2node.png" alt="o" width="16" height="22" /><img src="ftv2cl.png" alt="C" width="24" height="22" /><a class="el" href="classcub_1_1_device.html" target="_self">Device</a></td><td class="desc">Properties of a given CUDA device and the corresponding PTX bundle</td></tr>
 <tr id="row_0_16_"><td class="entry"><img src="ftv2blank.png" alt="&#160;" width="16" height="22" /><img src="ftv2node.png" alt="o" width="16" height="22" /><img src="ftv2cl.png" alt="C" width="24" height="22" /><a class="el" href="classcub_1_1_device_allocator.html" target="_self">DeviceAllocator</a></td><td class="desc">Abstract base allocator class for device memory allocations</td></tr>
-<tr id="row_0_17_" class="even"><td class="entry"><img src="ftv2blank.png" alt="&#160;" width="16" height="22" /><img src="ftv2node.png" alt="o" width="16" height="22" /><img src="ftv2cl.png" alt="C" width="24" height="22" /><a class="el" href="structcub_1_1_device_histo256.html" target="_self">DeviceHisto256</a></td><td class="desc"><a class="el" href="structcub_1_1_device_histo256.html" title="DeviceHisto256 provides variants of device-wide parallel histogram over data residing within global m...">DeviceHisto256</a> provides variants of device-wide parallel histogram over data residing within global memory. </p>
+<tr id="row_0_17_" class="even"><td class="entry"><img src="ftv2blank.png" alt="&#160;" width="16" height="22" /><img src="ftv2node.png" alt="o" width="16" height="22" /><img src="ftv2cl.png" alt="C" width="24" height="22" /><a class="el" href="structcub_1_1_device_histo256.html" target="_self">DeviceHisto256</a></td><td class="desc"><a class="el" href="structcub_1_1_device_histo256.html" title="DeviceHisto256 provides device-wide parallel operations for constructing 256-bin histogram(s) over sa...">DeviceHisto256</a> provides device-wide parallel operations for constructing 256-bin histogram(s) over data samples residing within global memory. </p>
 <div class="image">
 <img src="histogram_logo.png" alt="histogram_logo.png"/>
 </div>
 </td></tr>
-<tr id="row_0_18_"><td class="entry"><img src="ftv2blank.png" alt="&#160;" width="16" height="22" /><img src="ftv2node.png" alt="o" width="16" height="22" /><img src="ftv2cl.png" alt="C" width="24" height="22" /><a class="el" href="structcub_1_1_device_reduce.html" target="_self">DeviceReduce</a></td><td class="desc"><a class="el" href="structcub_1_1_device_reduce.html" title="DeviceReduce provides variants of parallel reduction data residing within a CUDA device&#39;s global memo...">DeviceReduce</a> provides variants of parallel reduction data residing within a CUDA device's global memory. </p>
+<tr id="row_0_18_"><td class="entry"><img src="ftv2blank.png" alt="&#160;" width="16" height="22" /><img src="ftv2node.png" alt="o" width="16" height="22" /><img src="ftv2cl.png" alt="C" width="24" height="22" /><a class="el" href="structcub_1_1_device_reduce.html" target="_self">DeviceReduce</a></td><td class="desc"><a class="el" href="structcub_1_1_device_reduce.html" title="DeviceReduce provides device-wide parallel operations for reducing data items residing within a CUDA ...">DeviceReduce</a> provides device-wide parallel operations for reducing data items residing within a CUDA device's global memory. </p>
 <div class="image">
 <img src="reduce_logo.png" alt="reduce_logo.png"/>
 </div>
@@ -198,7 +198,7 @@
 <!-- HTML footer for doxygen 1.8.3.1-->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
-Generated on Tue Apr 30 2013 01:43:33 for CUB by &#160;<a href="http://www.doxygen.org/index.html">
+Generated on Tue Apr 30 2013 15:22:27 for CUB by &#160;<a href="http://www.doxygen.org/index.html">
 <img class="footer" src="doxygen.png" alt="doxygen"/>
 </a> 1.8.3.1
 <br>
 
@@ -132,7 +132,7 @@
 <!-- HTML footer for doxygen 1.8.3.1-->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
-Generated on Tue Apr 30 2013 01:43:33 for CUB by &#160;<a href="http://www.doxygen.org/index.html">
+Generated on Tue Apr 30 2013 15:22:26 for CUB by &#160;<a href="http://www.doxygen.org/index.html">
 <img class="footer" src="doxygen.png" alt="doxygen"/>
 </a> 1.8.3.1
 <br>
 
@@ -131,7 +131,7 @@
 <!-- HTML footer for doxygen 1.8.3.1-->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
-Generated on Tue Apr 30 2013 01:43:33 for CUB by &#160;<a href="http://www.doxygen.org/index.html">
+Generated on Tue Apr 30 2013 15:22:26 for CUB by &#160;<a href="http://www.doxygen.org/index.html">
 <img class="footer" src="doxygen.png" alt="doxygen"/>
 </a> 1.8.3.1
 <br>
 
@@ -112,7 +112,7 @@
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="nested-classes"></a>
 Classes</h2></td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classcub_1_1_block_histo256.html">cub::BlockHisto256&lt; BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM &gt;</a></td></tr>
-<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight"><a class="el" href="classcub_1_1_block_histo256.html" title="BlockHisto256 provides methods for constructing (and compositing into) 256-valued histograms from 8b ...">BlockHisto256</a> provides methods for constructing (and compositing into) 256-valued histograms from 8b data partitioned across threads within a CUDA thread block. </p>
+<tr class="memdesc:"><td class="mdescLeft">&#160;</td><td class="mdescRight"><a class="el" href="classcub_1_1_block_histo256.html" title="BlockHisto256 provides methods for constructing (and compositing into) 256-bin histograms from 8b dat...">BlockHisto256</a> provides methods for constructing (and compositing into) 256-bin histograms from 8b data partitioned across threads within a CUDA thread block. </p>
 <div class="image">
 <img src="histogram_logo.png" alt="histogram_logo.png"/>
 <div class="caption">
@@ -134,12 +134,12 @@
 <tr class="separator:a0f61554b5c901fcc01adb8af3d9aacca"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
-<div class="textblock"><p><a class="el" href="classcub_1_1_block_histo256.html" title="BlockHisto256 provides methods for constructing (and compositing into) 256-valued histograms from 8b ...">cub::BlockHisto256</a> provides methods for constructing (and compositing into) 256-valued histograms from 8b data partitioned across threads within a CUDA thread block. </p>
+<div class="textblock"><p><a class="el" href="classcub_1_1_block_histo256.html" title="BlockHisto256 provides methods for constructing (and compositing into) 256-bin histograms from 8b dat...">cub::BlockHisto256</a> provides methods for constructing (and compositing into) 256-bin histograms from 8b data partitioned across threads within a CUDA thread block. </p>
 </div></div><!-- contents -->
 <!-- HTML footer for doxygen 1.8.3.1-->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
-Generated on Tue Apr 30 2013 01:43:33 for CUB by &#160;<a href="http://www.doxygen.org/index.html">
+Generated on Tue Apr 30 2013 15:22:26 for CUB by &#160;<a href="http://www.doxygen.org/index.html">
 <img class="footer" src="doxygen.png" alt="doxygen"/>
 </a> 1.8.3.1
 <br>
 
@@ -207,7 +207,7 @@
 <!-- HTML footer for doxygen 1.8.3.1-->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
-Generated on Tue Apr 30 2013 01:43:33 for CUB by &#160;<a href="http://www.doxygen.org/index.html">
+Generated on Tue Apr 30 2013 15:22:26 for CUB by &#160;<a href="http://www.doxygen.org/index.html">
 <img class="footer" src="doxygen.png" alt="doxygen"/>
 </a> 1.8.3.1
 <br>