I am trying to implement sum reduction using CUDA, however I want the reduction to be to the right not to the left.. I wrote the below code, but I am not sure why it is not working
__global__ void reduce_kernel(
float *input,
float *partialSums,
unsigned int N)
{
unsigned int segment = blockIdx.x * blockDim.x * 2;
unsigned int i = segment + threadIdx.x;
__shared__ float input_s[BLOCK_DIM];
input_s[threadIdx.x] = input[i] + input[i + BLOCK_DIM];
int count = 2;
__syncthreads();
for (unsigned int stride = BLOCK_DIM / 2;
stride < BLOCK_DIM;
stride = stride + (BLOCK_DIM / count))
{
if (threadIdx.x >= stride) {
count = count * 2;
input_s[threadIdx.x] += input_s[threadIdx.x - stride];
printf("%d ", stride);
__syncthreads();
if (stride == BLOCK_DIM - 1) {
break;
}
}
__syncthreads();
}
if (threadIdx.x == BLOCK_DIM - 1) {
partialSums[blockIdx.x] = input_s[threadIdx.x];
}
}
Any ideas what I am doing wrong?
This should be doing exactly what you want as long as the input number of elements is a power of two. The partial sum should end up to the right. The stride in such an algorithm has to grow from 1
to BLOCK_DIM / 2
(gives more warp divergences) or shrink from BLOCK_DIM / 2
to 1
. Either way it should be doing so by being multiplied/divided by 2
.
__global__ void reduce_kernel(
float *input,
float *partialSums,
unsigned int N)
{
unsigned int segment = blockIdx.x * blockDim.x * 2;
unsigned int i = segment + threadIdx.x;
__shared__ float input_s[BLOCK_DIM];
input_s[threadIdx.x] = input[i] + input[i + BLOCK_DIM];
__syncthreads();
for (unsigned int stride = BLOCK_DIM / 2;
stride > 0;
stride /= 2)
{
if (threadIdx.x >= BLOCK_DIM - stride) {
input_s[threadIdx.x] += input_s[threadIdx.x - stride];
}
__syncthreads();
}
if (threadIdx.x == BLOCK_DIM - 1) {
partialSums[blockIdx.x] = input_s[threadIdx.x];
}
}
The __syncthreads();
inside the conditional is another error, as all threads of the block have to participate in the synchronization. Otherwise undefined behavior is caused.