Hello everyone, I'm trying to use the grid-stride method together with atomic functions to do a multi-block reduction.
I know that the usual way to do this is to launch two kernels, or to use the last-block method as described in this note (or this tutorial).
However, I thought this could also be done by combining a grid-stride loop with an atomic add.
In my tests it worked very well...
until, for some values of n, it gives the wrong answer (which is very weird).
I have tested several values of "n" and found that I get a wrong answer for n = 1234565, 1234566, 1234567.
Below is my whole code; it sums n ones, so the answer should be n.
Any help or comment is appreciated.
#include <algorithm>
#include <cstdio>

__global__ void stride_sum(const double* input,
                           const int size,
                           double* sumOut)
{
    extern __shared__ double sm[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockDim.x * blockIdx.x + tid;
    // doing grid loop using stride method.
    for (unsigned int s = i; s < size; s += blockDim.x * gridDim.x) {
        sm[tid] = input[i];
        __syncthreads();
        // doing parallel reduction.
        for (unsigned int ss = blockDim.x / 2; ss > 0; ss >>= 1) {
            if (tid < ss && tid + ss < size) sm[tid] += sm[tid + ss];
            __syncthreads();
        }
        // atomically add results to sumOut.
        if (tid == 0) atomicAdd(sumOut, sm[0]);
    }
}

int main()
{
    unsigned int n = 1234567;
    int blockSize = 4;
    int nBlocks = (n + blockSize - 1) / blockSize;
    int sharedMemory = sizeof(double) * blockSize;
    double *data, *sum;
    cudaMallocManaged(&data, sizeof(double) * n);
    cudaMallocManaged(&sum, sizeof(double));
    std::fill_n(data, n, 1.);
    std::fill_n(sum, 1, 0.);
    stride_sum<<<nBlocks, blockSize, sharedMemory>>>(data, n, sum);
    cudaDeviceSynchronize();
    printf("res: %10.f \n", sum[0]);
    cudaFree(data);
    cudaFree(sum);
    return 0;
}
You have gotten quite a lot wrong in your implementation. This will work:
__global__ void stride_sum(const double* input,
                           const int size,
                           double* sumOut)
{
    extern __shared__ volatile double sm[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockDim.x * blockIdx.x + tid;
    // Grid-stride loop: each thread accumulates its own partial sum in a register.
    double val = 0.;
    for (unsigned int s = i; s < size; s += blockDim.x * gridDim.x) {
        val += input[s];
    }
    // Load the partial sum to shared memory. Every thread writes its slot,
    // even threads that never entered the loop (they contribute 0).
    sm[tid] = val;
    __syncthreads();
    // Parallel reduction in shared memory. Since every slot was initialized
    // above, no extra bounds check against size is needed.
    for (unsigned int ss = blockDim.x / 2; ss > 0; ss >>= 1) {
        if (tid < ss) sm[tid] += sm[tid + ss];
        __syncthreads();
    }
    // Atomically add the block's result to sumOut.
    if (tid == 0) atomicAdd(sumOut, sm[0]);
}
[Never compiled and run, use at your own risk]
In short: do the grid-strided summation first, then a single shared memory reduction per block, then a single atomic update per block. Your implementation has undefined behaviour in a few places, most notably the conditionally executed __syncthreads calls and the reads of uninitialized shared memory. __syncthreads must be reached by every thread of the block, but in your version threads with no work (i >= size) never enter the stride loop at all, so they skip the barriers that other threads are waiting on, and their sm[tid] slots are read in the reduction without ever having been written.
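For what it's worth, here is a quick host-side check (my sketch, never compiled either, and assuming the corrected kernel above sits in the same file) that runs stride_sum on the three failing sizes. The block size of 256 and the cap of 32 blocks are arbitrary choices for illustration; the point is that with a grid-stride loop the grid does not have to cover the whole input, and a small fixed grid keeps contention on sumOut low.

#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned int sizes[] = {1234565u, 1234566u, 1234567u};
    const unsigned int blockSize = 256;  // must stay a power of two for the reduction loop
    for (unsigned int n : sizes) {
        // A grid-stride kernel does not need one block per blockSize elements;
        // a small capped grid is enough and limits the number of atomic updates.
        unsigned int nBlocks = std::min((n + blockSize - 1) / blockSize, 32u);
        double *data, *sum;
        cudaMallocManaged(&data, sizeof(double) * n);
        cudaMallocManaged(&sum, sizeof(double));
        std::fill_n(data, n, 1.);
        *sum = 0.;
        stride_sum<<<nBlocks, blockSize, sizeof(double) * blockSize>>>(data, n, sum);
        cudaDeviceSynchronize();
        // If the reduction is correct this should print exactly n for each size.
        printf("n = %u -> sum = %.0f\n", n, *sum);
        cudaFree(data);
        cudaFree(sum);
    }
    return 0;
}

One more caveat: atomicAdd on double is only available on GPUs with compute capability 6.0 or newer, so compile for an appropriate architecture (e.g. -arch=sm_60 or later).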