I am trying to understand the scan-then-fan scan implementation described in the book The CUDA Handbook. Specifically:

1. In scanWarp, why the negative indexes? Could you please give a numerical example?
2. How is the assignment warpPartials[16+warpid] = sum; happening?
3. What does the call if ( warpid==0 ) { scanWarp<T,bZeroPadded>( 16+warpPartials+tid ); } do?
4. A numerical example for sum += warpPartials[16+warpid-1]; would be highly appreciated.
5. How does *sPartials = sum; store the values back in sPartials?

PS: A numerical example that demonstrates the whole execution would be very helpful.

The code in question:
template < class T, bool bZeroPadded >
inline __device__ T
scanBlock( volatile T *sPartials ){
    extern __shared__ T warpPartials[];
    const int tid = threadIdx.x;
    const int lane = tid & 31;
    const int warpid = tid >> 5;

    //
    // Compute this thread's partial sum
    //
    T sum = scanWarp<T,bZeroPadded>( sPartials );
    __syncthreads();

    //
    // Write each warp's reduction to shared memory
    //
    if ( lane == 31 ) {
        warpPartials[16+warpid] = sum;
    }
    __syncthreads();

    //
    // Have one warp scan reductions
    //
    if ( warpid==0 ) {
        scanWarp<T,bZeroPadded>( 16+warpPartials+tid );
    }
    __syncthreads();

    //
    // Fan out the exclusive scan element (obtained
    // by the conditional and the decrement by 1)
    // to this warp's pending output
    //
    if ( warpid > 0 ) {
        sum += warpPartials[16+warpid-1];
    }
    __syncthreads();

    //
    // Write this thread's scan output
    //
    *sPartials = sum;
    __syncthreads();

    //
    // The return value will only be used by caller if it
    // contains the spine value (i.e. the reduction
    // of the array we just scanned).
    //
    return sum;
}
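For context, here is a minimal sketch of how scanBlock gets invoked (my own illustration based on the book's pattern, not the book's code; the name scanKernel and the exact load/store details are assumptions). Each thread loads one element into dynamic shared memory and passes the address of its own slot, which is why the function ends with *sPartials = sum:

// Hypothetical driver kernel (an illustrative sketch, not from the book).
// Assumes the guarded scanWarp below (bZeroPadded == false), blockDim.x a
// multiple of 32, and a launch such as:
//   scanKernel<int,false><<<numBlocks, 256, 256*sizeof(int)>>>( out, in, N );
// The dynamic shared memory declared here as sPartials is the same
// allocation scanBlock sees as warpPartials: all extern __shared__
// declarations alias one region.
template < class T, bool bZeroPadded >
__global__ void
scanKernel( T *out, const T *in, size_t N )
{
    extern __shared__ T sPartials[];
    const int tid = threadIdx.x;
    size_t i = blockIdx.x*blockDim.x + tid;

    sPartials[tid] = (i < N) ? in[i] : T(0);
    __syncthreads();

    //
    // Each thread passes a pointer to its own slot, so the negative
    // indexes inside scanWarp are relative to that per-thread pointer.
    //
    T sum = scanBlock<T,bZeroPadded>( sPartials+tid );
    if ( i < N ) out[i] = sum;
}

This only scans within each block; the full scan-then-fan algorithm adds a pass that scans the per-block totals (the "spine" value scanBlock returns) and fans them out across blocks.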
template < class T >
inline __device__ T
scanWarp( volatile T *sPartials ){
    const int tid  = threadIdx.x;
    const int lane = tid & 31;

    if ( lane >=  1 ) sPartials[0] += sPartials[- 1];
    if ( lane >=  2 ) sPartials[0] += sPartials[- 2];
    if ( lane >=  4 ) sPartials[0] += sPartials[- 4];
    if ( lane >=  8 ) sPartials[0] += sPartials[- 8];
    if ( lane >= 16 ) sPartials[0] += sPartials[-16];
    return sPartials[0];
}
(Answer rewritten now that I have more time.)
Here's an example (based on an implementation I wrote in C++ AMP, not CUDA). To make the diagram smaller, each warp is 4 elements wide and a block is 16 elements.
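Here is the idea as a worked trace with input 1..16 (my numbers; I leave out the 16-element warpPartials offset, which only shifts where the warp totals live in shared memory):

input:        w0: 1 2 3 4 | w1: 5 6 7 8 | w2: 9 10 11 12 | w3: 13 14 15 16

1. scanWarp gives each warp its own inclusive scan:
              w0: 1 3 6 10 | w1: 5 11 18 26 | w2: 9 19 30 42 | w3: 13 27 42 58

2. the last lane of each warp writes its warp total:
              warpPartials = [ 10, 26, 42, 58 ]

3. warp 0 scans those totals:
              warpPartials = [ 10, 36, 78, 136 ]

4. fan out: each warp w > 0 adds warpPartials[w-1], its exclusive prefix:
              w1 += 10, w2 += 36, w3 += 78
              w0: 1 3 6 10 | w1: 15 21 28 36 | w2: 45 55 66 78 | w3: 91 105 120 136

The last row is the inclusive scan of the whole block, and the spine value 136 is the block's total.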
Two papers are also pretty useful: Efficient Parallel Scan Algorithms for GPUs, and Parallel Scan for Stream Architectures.
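As for the negative indexes in scanWarp specifically: each thread receives a pointer to its own element, so sPartials[-d] simply reads the value d lanes to the left, and the if ( lane >= d ) guards stop the first lanes of the warp from reading out of range. Here is a small host-side simulation of the lockstep execution the warp-synchronous code relies on (my own sketch, with an 8-lane "warp" so the trace stays short; a real 32-lane warp just adds the d = 8 and d = 16 steps):

#include <cstdio>

int main()
{
    const int W = 8;                          // pretend warp width
    int x[W] = { 3, 1, 4, 1, 5, 9, 2, 6 };

    for ( int d = 1; d < W; d *= 2 ) {
        //
        // One warp-synchronous step: every active lane reads its
        // neighbor d slots to the left (sPartials[-d]) before any
        // lane writes, so copy the previous state first.
        //
        int prev[W];
        for ( int i = 0; i < W; i++ ) prev[i] = x[i];
        for ( int i = d; i < W; i++ ) x[i] = prev[i] + prev[i-d];   // lane >= d

        printf( "after d=%2d:", d );
        for ( int i = 0; i < W; i++ ) printf( " %3d", x[i] );
        printf( "\n" );
    }
    //
    // Output:
    //   after d= 1:   3   4   5   5   6  14  11   8
    //   after d= 2:   3   4   8   9  11  19  17  22
    //   after d= 4:   3   4   8   9  14  23  25  31
    // i.e. the inclusive prefix sums of { 3, 1, 4, 1, 5, 9, 2, 6 }.
    //
    return 0;
}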