Hello everyone, I'm trying to use the grid-stride method together with atomic functions to do a multi-block reduction.
I know that the usual way to do this is to launch two kernels, or to use the last-block method as described in this note (or this tutorial).
However, I thought this could also be done by combining a grid-stride loop with an atomic add.
In my tests it worked very well...
until, for some values of n, it gives the wrong answer (which is very weird).
I have tested several values of "n" and found that I get a wrong answer for n = 1234565, 1234566, 1234567.
Below is my whole code; it sums n ones, so the answer should be n.
Any help or comment is appreciated.
#include <algorithm>
#include <cstdio>

__global__ void stride_sum(const double* input,
                           const int size,
                           double* sumOut)
{
    extern __shared__ double sm[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockDim.x * blockIdx.x + tid;
    // doing grid loop using stride method.
    for (unsigned int s = i; s < size; s += blockDim.x * gridDim.x) {
        sm[tid] = input[i];
        __syncthreads();
        // doing parallel reduction.
        for (unsigned int ss = blockDim.x / 2; ss > 0; ss >>= 1) {
            if (tid < ss && tid + ss < size) sm[tid] += sm[tid + ss];
            __syncthreads();
        }
        // atomically add results to sumOut.
        if (tid == 0) atomicAdd(sumOut, sm[0]);
    }
}

int main()
{
    unsigned int n = 1234567;
    int blockSize = 4;
    int nBlocks = (n + blockSize - 1) / blockSize;
    int sharedMemory = sizeof(double) * blockSize;
    double *data, *sum;
    cudaMallocManaged(&data, sizeof(double) * n);
    cudaMallocManaged(&sum, sizeof(double));
    std::fill_n(data, n, 1.);
    std::fill_n(sum, 1, 0.);
    stride_sum<<<nBlocks, blockSize, sharedMemory>>>(data, n, sum);
    cudaDeviceSynchronize();
    printf("res: %10.f \n", sum[0]);
    cudaFree(data);
    cudaFree(sum);
    return 0;
}
You have gotten quite a lot wrong in your implementation. This will work:
__global__ void stride_sum(const double* input,
                           const int size,
                           double* sumOut)
{
    extern __shared__ volatile double sm[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockDim.x * blockIdx.x + tid;
    // Grid-stride loop: each thread accumulates its own partial sum in a register.
    double val = 0.;
    for (unsigned int s = i; s < size; s += blockDim.x * gridDim.x) {
        val += input[s];
    }
    // Load the partial sum to shared memory. Every thread writes its slot,
    // even threads that never entered the loop (they contribute 0).
    sm[tid] = val;
    __syncthreads();
    // Parallel reduction in shared memory. Since every slot was initialized
    // above, no extra bounds check against size is needed.
    for (unsigned int ss = blockDim.x / 2; ss > 0; ss >>= 1) {
        if (tid < ss) sm[tid] += sm[tid + ss];
        __syncthreads();
    }
    // Atomically add the block's result to sumOut.
    if (tid == 0) atomicAdd(sumOut, sm[0]);
}
[Never compiled and run, use at your own risk]
In short: do the grid-strided summation first, then a single shared memory reduction per block, then a single atomic update per block. Your implementation has undefined behaviour in a few places, most notably the conditionally executed __syncthreads calls and the reads of uninitialized shared memory. __syncthreads must be reached by every thread of the block, but in your version threads with no work (i >= size) never enter the stride loop at all, so they skip the barriers that other threads are waiting on, and their sm[tid] slots are read in the reduction without ever having been written.
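For what it's worth, here is a quick host-side check (my sketch, never compiled either, and assuming the corrected kernel above sits in the same file) that runs stride_sum on the three failing sizes. The block size of 256 and the cap of 32 blocks are arbitrary choices for illustration; the point is that with a grid-stride loop the grid does not have to cover the whole input, and a small fixed grid keeps contention on sumOut low.

#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned int sizes[] = {1234565u, 1234566u, 1234567u};
    const unsigned int blockSize = 256;  // must stay a power of two for the reduction loop
    for (unsigned int n : sizes) {
        // A grid-stride kernel does not need one block per blockSize elements;
        // a small capped grid is enough and limits the number of atomic updates.
        unsigned int nBlocks = std::min((n + blockSize - 1) / blockSize, 32u);
        double *data, *sum;
        cudaMallocManaged(&data, sizeof(double) * n);
        cudaMallocManaged(&sum, sizeof(double));
        std::fill_n(data, n, 1.);
        *sum = 0.;
        stride_sum<<<nBlocks, blockSize, sizeof(double) * blockSize>>>(data, n, sum);
        cudaDeviceSynchronize();
        // If the reduction is correct this should print exactly n for each size.
        printf("n = %u -> sum = %.0f\n", n, *sum);
        cudaFree(data);
        cudaFree(sum);
    }
    return 0;
}

One more caveat: atomicAdd on double is only available on GPUs with compute capability 6.0 or newer, so compile for an appropriate architecture (e.g. -arch=sm_60 or later).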