I'm a CUDA beginner trying to use the approach from Faster Parallel Reductions on Kepler in my program, but I don't get the expected result. Below is what I'm doing; the output is 0. I would appreciate help understanding my mistake.
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include <stdio.h>
#include <math.h>
__inline__ __device__
float warpReduceSum(float val) {
  for (int offset = warpSize/2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);
  return val;
}

__inline__ __device__
float blockReduceSum(float val) {
  static __shared__ int shared[32]; // Shared mem for 32 partial sums
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;
  val = warpReduceSum(val);         // Each warp performs partial reduction
  if (lane==0) shared[wid]=val;     // Write reduced value to shared memory
  __syncthreads();                  // Wait for all partial reductions
  // read from shared memory only if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid==0) val = warpReduceSum(val); // Final reduce within first warp
  return val;
}

__global__ void deviceReduceKernel(float *in, float* out, size_t N)
{
  float sum = 0;
  // reduce multiple elements per thread
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
  {
    sum += in[i];
  }
  sum = blockReduceSum(sum);
  if (threadIdx.x==0)
    out[blockIdx.x]=sum;
}
int main()
{
  int n = 1000000;
  float *b = new float[1]();
  float *d = new float[1]();
  float *a;
  int blocks = (n/512)+1;
  float *d_intermediate;
  cudaMalloc((void**)&d_intermediate, n*sizeof(float));
  cudaMalloc((void**)&a, n*sizeof(float));
  cudaMemset(a, 1, n*sizeof(float));
  deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
  deviceReduceKernel<<<1, 1024>>>(d_intermediate, &b[0], blocks);
  cudaMemcpy(d, b, sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_intermediate);
  std::cout << d[0];
  return 0;
}
There are various problems with your code:
Any time you are having trouble with CUDA code, you should use proper CUDA error checking and run your code with cuda-memcheck before asking others for help. Even if you don't understand the error output, it will be useful to others trying to help you. If you had done that with this code, you would have been advised of various errors/problems.
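In practice, "proper CUDA error checking" means wrapping every runtime API call and checking for errors after each kernel launch. Here is a minimal sketch of the usual pattern (the cudaCheck macro name is my own, not something in your code):

#include <cstdio>
#include <cstdlib>
#define cudaCheck(call)                                              \
  do {                                                               \
    cudaError_t err = (call);                                        \
    if (err != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error: %s at %s:%d\n",                   \
              cudaGetErrorString(err), __FILE__, __LINE__);          \
      exit(1);                                                       \
    }                                                                \
  } while (0)

// example usage:
cudaCheck(cudaMalloc((void**)&a, n*sizeof(float)));
deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
cudaCheck(cudaGetLastError());        // reports kernel launch errors
cudaCheck(cudaDeviceSynchronize());   // reports errors that occur during kernel execution

Running the program under cuda-memcheck would also have flagged the invalid pointer usage described below.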
Any pointer passed to a CUDA kernel should be a valid CUDA device pointer. Your b pointer is a host pointer:
float *b = new float[1]();
so you cannot use it here:
deviceReduceKernel<<<1, 1024>>>(d_intermediate, &b[0], blocks);
                                                ^
Since you evidently want to use it for storage of a single float quantity on the device, we can easily re-use the a pointer for that.
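If you would rather not overwrite a, the same idea works with a dedicated device-side result buffer. A minimal sketch (d_result is a name I'm introducing here, not something from your code):

float *d_result;                                // device pointer for the final sum
cudaMalloc((void**)&d_result, sizeof(float));   // one float on the device
deviceReduceKernel<<<1, 1024>>>(d_intermediate, d_result, blocks);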
For a similar reason, this isn't sensible:
cudaMemcpy(d, b, sizeof(float), cudaMemcpyDeviceToHost);
In this case both b and d are host pointers. That will not copy data from the device to the host.
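The copy has to go from a device pointer to a host location. Continuing the hypothetical d_result sketch above, a correct version would look roughly like:

float result = 0;                                                       // host variable
cudaMemcpy(&result, d_result, sizeof(float), cudaMemcpyDeviceToHost);   // device -> host
std::cout << result << std::endl;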
This probably doesn't do what you think:
cudaMemset(a, 1, n*sizeof(float));
I imagine you think this will fill a float array with the quantity 1, but it won't. cudaMemset, like memset, fills bytes and takes a byte quantity. If you use it to fill a float array, you are effectively creating an array filled with 0x01010101. I don't know what value that translates into when you convert the bit pattern to a float quantity, but it will not give you a float value of 1. We'll fix this by filling an ordinary host array with a loop, and then transferring that data to the device to be reduced.
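If you are curious what those bytes decode to, a small host-only sketch (plain C++, no CUDA required) can reinterpret the bit pattern:

#include <cstdio>
#include <cstring>
int main() {
  unsigned int pattern = 0x01010101;   // what cudaMemset(a, 1, ...) writes into each 4-byte float
  float f;
  memcpy(&f, &pattern, sizeof(f));     // reinterpret the bytes as a float
  printf("%g\n", f);                   // prints a tiny value, roughly 2.4e-38 -- not 1.0f
  return 0;
}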
Here's a modified version of the code that has the above issues addressed, and it runs correctly for me:
$ cat t1290.cu
#include <iostream>
#include <stdio.h>
#include <math.h>
__inline__ __device__
float warpReduceSum(float val) {
  for (int offset = warpSize/2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);
  return val;
}

__inline__ __device__
float blockReduceSum(float val) {
  static __shared__ int shared[32]; // Shared mem for 32 partial sums
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;
  val = warpReduceSum(val);         // Each warp performs partial reduction
  if (lane==0) shared[wid]=val;     // Write reduced value to shared memory
  __syncthreads();                  // Wait for all partial reductions
  // read from shared memory only if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid==0) val = warpReduceSum(val); // Final reduce within first warp
  return val;
}

__global__ void deviceReduceKernel(float *in, float* out, size_t N)
{
  float sum = 0;
  // reduce multiple elements per thread
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
  {
    sum += in[i];
  }
  sum = blockReduceSum(sum);
  if (threadIdx.x==0)
    out[blockIdx.x]=sum;
}

int main()
{
  int n = 1000000;
  float b;
  float *a, *a_host;
  a_host = new float[n];
  int blocks = (n/512)+1;
  float *d_intermediate;
  cudaMalloc((void**)&d_intermediate, blocks*sizeof(float));
  cudaMalloc((void**)&a, n*sizeof(float));
  for (int i = 0; i < n; i++) a_host[i] = 1;
  cudaMemcpy(a, a_host, n*sizeof(float), cudaMemcpyHostToDevice);
  deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
  deviceReduceKernel<<<1, 1024>>>(d_intermediate, a, blocks);
  cudaMemcpy(&b, a, sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_intermediate);
  std::cout << b << std::endl;
  return 0;
}
$ nvcc -arch=sm_35 -o t1290 t1290.cu
$ cuda-memcheck ./t1290
========= CUDA-MEMCHECK
1e+06
========= ERROR SUMMARY: 0 errors
$
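One side note, unrelated to the bugs above: on CUDA 9 and newer toolkits, __shfl_down() is deprecated in favor of __shfl_down_sync(), so on current compilers warpReduceSum would look roughly like this:

__inline__ __device__
float warpReduceSum(float val) {
  for (int offset = warpSize/2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);  // 0xffffffff = all lanes in the warp participate
  return val;
}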