@@ -319,9 +319,9 @@ struct ScanTileState<T, false>
319319 void * allocations[3 ];
320320 size_t allocation_sizes[3 ];
321321
322- allocation_sizes[0 ] = (num_tiles + TILE_STATUS_PADDING) * sizeof (StatusWord); // bytes needed for tile status descriptors
323- allocation_sizes[1 ] = num_tiles * sizeof (Uninitialized<T>); // bytes needed for partials
324- allocation_sizes[2 ] = num_tiles * sizeof (Uninitialized<T>); // bytes needed for inclusives
322+ allocation_sizes[0 ] = (num_tiles + TILE_STATUS_PADDING) * sizeof (StatusWord); // bytes needed for tile status descriptors
323+ allocation_sizes[1 ] = ( num_tiles + TILE_STATUS_PADDING) * sizeof (Uninitialized<T>); // bytes needed for partials
324+ allocation_sizes[2 ] = ( num_tiles + TILE_STATUS_PADDING) * sizeof (Uninitialized<T>); // bytes needed for inclusives
325325
326326 // Compute allocation pointers into the single storage blob
327327 if (CubDebug (error = AliasTemporaries (d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break ;
@@ -348,8 +348,8 @@ struct ScanTileState<T, false>
348348 // Specify storage allocation requirements
349349 size_t allocation_sizes[3 ];
350350 allocation_sizes[0 ] = (num_tiles + TILE_STATUS_PADDING) * sizeof (StatusWord); // bytes needed for tile status descriptors
351- allocation_sizes[1 ] = num_tiles * sizeof (Uninitialized<T>); // bytes needed for partials
352- allocation_sizes[2 ] = num_tiles * sizeof (Uninitialized<T>); // bytes needed for inclusives
351+ allocation_sizes[1 ] = ( num_tiles + TILE_STATUS_PADDING) * sizeof (Uninitialized<T>); // bytes needed for partials
352+ allocation_sizes[2 ] = ( num_tiles + TILE_STATUS_PADDING) * sizeof (Uninitialized<T>); // bytes needed for inclusives
353353
354354 // Set the necessary size of the blob
355355 void * allocations[3 ];
@@ -383,7 +383,7 @@ struct ScanTileState<T, false>
383383 __device__ __forceinline__ void SetInclusive (int tile_idx, T tile_inclusive)
384384 {
385385 // Update tile inclusive value
386- ThreadStore<STORE_CG>(d_tile_inclusive + tile_idx, tile_inclusive);
386+ ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
387387
388388 // Fence
389389 __threadfence ();
@@ -399,7 +399,7 @@ struct ScanTileState<T, false>
399399 __device__ __forceinline__ void SetPartial (int tile_idx, T tile_partial)
400400 {
401401 // Update tile partial value
402- ThreadStore<STORE_CG>(d_tile_partial + tile_idx, tile_partial);
402+ ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
403403
404404 // Fence
405405 __threadfence ();
@@ -422,8 +422,8 @@ struct ScanTileState<T, false>
422422 status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
423423 }
424424
425- T partial = ThreadLoad<LOAD_CG>(d_tile_partial + tile_idx);
426- T inclusive = ThreadLoad<LOAD_CG>(d_tile_inclusive + tile_idx);
425+ T partial = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
426+ T inclusive = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
427427
428428 value = (status == StatusWord (SCAN_TILE_PARTIAL)) ?
429429 partial :
@@ -475,7 +475,7 @@ struct BlockScanLookbackPrefixOp
475475 };
476476
477477 // Fields
478- ScanTileState &tile_status; // /< Interface to tile status
478+ ScanTileState &tile_status; // /< Interface to tile status
479479 _TempStorage &temp_storage; // /< Reference to a warp-reduction instance
480480 ScanOp scan_op; // /< Binary scan operator
481481 int tile_idx; // /< The current tile index
0 commit comments