cuda, reduction, stride

What is the proper way to use stride in CUDA to do a multi-block reduction?


Hello everyone, I'm trying to use the grid-stride method and atomic functions to do a multi-block reduction.
I know that the usual way to do this is either to launch two kernels or to use the last-block method, as described in this note (or this tutorial).
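
(For reference, here is a minimal sketch of the two-kernel idea; the kernel below is my own illustration rather than the code from the note or the tutorial. Pass 1 writes one partial sum per block, and pass 2 is the same kernel launched with a single block to combine the partials.)

__global__ void block_sum(const double* in, const int n, double* out){
    extern __shared__ double sm[];
    unsigned int tid = threadIdx.x;

    // each thread accumulates a strided slice of the input in a register
    double val = 0.;
    for(unsigned int i = blockIdx.x * blockDim.x + tid; i < n; i += blockDim.x * gridDim.x)
        val += in[i];

    // standard in-block tree reduction (blockDim.x assumed to be a power of two)
    sm[tid] = val;
    __syncthreads();
    for(unsigned int s = blockDim.x / 2; s > 0; s >>= 1){
        if(tid < s) sm[tid] += sm[tid + s];
        __syncthreads();
    }
    if(tid == 0) out[blockIdx.x] = sm[0];
}

// pass 1: one partial sum per block
//   block_sum<<<nBlocks, blockSize, sharedMemory>>>(data, n, partial);
// pass 2: combine the nBlocks partials with a single block
//   block_sum<<<1, blockSize, sharedMemory>>>(partial, nBlocks, sum);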

However, I thought this could also be done by combining a grid-stride loop with atomics.
As far as I have tested, it works very well...
until, for some values of n, it gives the wrong answer (which is very weird).

I have tested several values of n and found that I get the wrong answer for n = 1234565, 1234566, and 1234567.
Below is my whole code; it just sums n ones, so the answer should be n.
Any help or comment is appreciated.

#include <iostream>
#include <algorithm>   // std::fill_n
#include <cstdio>      // printf

__global__ void stride_sum(const double* input,
                           const int size,
                           double* sumOut){
    extern __shared__ double sm[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockDim.x * blockIdx.x + tid;

    //doing grid loop using stride method.
    for(unsigned int s=i;
            s<size;
            s+=blockDim.x*gridDim.x){
        sm[tid] = input[i];
        __syncthreads();

        //doing parallel reduction.
        for(unsigned int ss = blockDim.x/2;ss>0;ss>>=1){
            if(tid<ss && tid+ss<size) sm[tid] += sm[tid+ss];
            __syncthreads();
        }

        //atomically add results to sumOut.
        if(tid==0) atomicAdd(sumOut, sm[0]);
    }
}

int main(){

    unsigned int n = 1234567;
    int blockSize = 4;
    int nBlocks = (n + blockSize - 1) / blockSize;
    int sharedMemory = sizeof(double)*blockSize;

    double *data, *sum;

    cudaMallocManaged(&data, sizeof(double)*n);
    cudaMallocManaged(&sum, sizeof(double));

    std::fill_n(data,n,1.);
    std::fill_n(sum,1,0.);

    stride_sum<<<nBlocks, blockSize, sharedMemory>>>(data,n,sum);

    cudaDeviceSynchronize();

    printf("res: 10.f \n",sum[0]);

    cudaFree(data);
    cudaFree(sum);

    return 0;
}


Solution

  • You have gotten quite a lot wrong in your implementation. This will work:

    __global__ void stride_sum(const double* input,
                               const int size,
                               double* sumOut)
    {
        extern __shared__ volatile double sm[];
    
        unsigned int tid = threadIdx.x;
        unsigned int i = blockDim.x * blockIdx.x + tid;
    
        //doing grid loop using stride method.
        double val = 0.;
        for(unsigned int s=i; s<size; s+=blockDim.x*gridDim.x){
            val += input[s];   // index with the loop variable s, not i
        }
    
        // Load partial sum to memory
        sm[tid] = val; 
        __syncthreads();
    
        //doing parallel reduction.
        for(unsigned int ss = blockDim.x/2;ss>0;ss>>=1){
            if(tid<ss && tid+ss<size) sm[tid] += sm[tid+ss];
            __syncthreads();
        }
    
        //atomically add results to sumOut.
        if(tid==0) atomicAdd(sumOut, sm[0]);
    }
    

    [Never compiled and run, use at your own risk]

    In short: do the grid-strided summation into a per-thread register, then a single shared memory reduction per block, then a single atomic update. Your implementation has undefined behaviour in a few places, most importantly the conditionally executed __syncthreads calls (threads that leave the stride loop early stop reaching the barrier) and the use of uninitialized shared memory when some threads fall out of the summation loop.
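
    For completeness, here is a sketch of how that kernel might be driven from the host (again untested; the fixed launch configuration of 256 blocks of 256 threads is my assumption, not part of the answer). A grid-stride kernel is normally launched with a grid sized to the device rather than to n, so that each thread actually strides over several elements before the in-block reduction:

    #include <algorithm>
    #include <cstdio>

    int main(){
        const unsigned int n = 1234567;
        const int blockSize = 256;               // power of two, as the reduction loop requires
        const int nBlocks   = 256;               // fixed grid: each thread sums roughly n/(256*256) elements
        const size_t sharedMemory = sizeof(double) * blockSize;

        double *data, *sum;
        cudaMallocManaged(&data, sizeof(double) * n);
        cudaMallocManaged(&sum, sizeof(double));

        std::fill_n(data, n, 1.);
        *sum = 0.;

        stride_sum<<<nBlocks, blockSize, sharedMemory>>>(data, n, sum);
        cudaDeviceSynchronize();

        printf("res: %10.f (expected %u)\n", sum[0], n);

        cudaFree(data);
        cudaFree(sum);
        return 0;
    }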