Tags: cuda, reduction, kepler

Faster Parallel Reductions on Kepler


I'm a CUDA beginner trying to use the Faster Parallel Reductions on Kepler approach in my program, but I'm not getting the expected result: the output is 0. Below is the code showing what I'm doing. I'd appreciate help finding my mistake.

#ifndef __CUDACC__  
#define __CUDACC__
#endif

#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include <stdio.h>
#include <math.h>

__inline__ __device__
float warpReduceSum(float val) {
  for (int offset = warpSize/2; offset > 0; offset /= 2) 
    val += __shfl_down(val, offset);
  return val;
}

__inline__ __device__
float blockReduceSum(float val) {

  static __shared__ int shared[32]; // Shared mem for 32 partial sums
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;

  val = warpReduceSum(val);     // Each warp performs partial reduction

  if (lane==0) shared[wid]=val; // Write reduced value to shared memory

  __syncthreads();              // Wait for all partial reductions

  //read from shared memory only if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

  if (wid==0) val = warpReduceSum(val); //Final reduce within first warp

  return val;
}

__global__ void deviceReduceKernel(float *in, float* out, size_t N)
{
  float sum = 0;
  //reduce multiple elements per thread
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) 
  {
    sum += in[i];
  }
  sum = blockReduceSum(sum);
  if (threadIdx.x==0)
    out[blockIdx.x]=sum;
}

int main()
{
    int n = 1000000;
    float *b = new float[1]();
    float *d = new float[1]();
    float *a ;


    int blocks = (n/512)+1;
    float *d_intermediate;

    cudaMalloc((void**)&d_intermediate, n*sizeof(float));
    cudaMalloc((void**)&a, n*sizeof(float));

    cudaMemset(a, 1, n*sizeof(float));

    deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
    deviceReduceKernel<<<1, 1024>>>(d_intermediate, &b[0], blocks);
    cudaMemcpy(d, b, sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_intermediate);
    std::cout << d[0];
    return 0;

}

Solution

  • There are various problems with your code:

    1. Any time you are having trouble with a CUDA code, you should use proper cuda error checking and run your code with cuda-memcheck before asking others for help. Even if you don't understand the error output, it will be useful to others trying to help you. If you had done that with this code, you would have been advised of various errors/problems.
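
      The usual pattern is a small macro that wraps every CUDA runtime API call, plus a check after each kernel launch (a sketch; the gpuErrchk/gpuAssert names are illustrative, not part of the CUDA API):

      #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
      inline void gpuAssert(cudaError_t code, const char *file, int line)
      {
        if (code != cudaSuccess) {
          fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
          exit(code);
        }
      }

      // wrap runtime calls:
      gpuErrchk(cudaMalloc((void**)&a, n*sizeof(float)));
      // and check kernel launches after the fact:
      deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
      gpuErrchk(cudaGetLastError());
      gpuErrchk(cudaDeviceSynchronize());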

    2. Any pointer passed to a CUDA kernel should be a valid CUDA device pointer. Your b pointer is a host pointer:

      float *b = new float[1]();
      

      so you cannot use it here:

      deviceReduceKernel<<<1, 1024>>>(d_intermediate, &b[0], blocks);
                                                       ^
      

      Since you evidently want to use it to store a single float quantity on the device, we can easily re-use the a pointer for that.
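
      Alternatively, if you'd rather not re-use a, allocating a dedicated one-float device buffer works as well (a sketch; d_result is an illustrative name):

      float *d_result;
      cudaMalloc((void**)&d_result, sizeof(float));
      deviceReduceKernel<<<1, 1024>>>(d_intermediate, d_result, blocks);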

    3. For a similar reason, this isn't sensible:

      cudaMemcpy(d, b, sizeof(float), cudaMemcpyDeviceToHost);
      

      In this case both b and d are host pointers, so this call will not copy data from the device to the host.

    4. This probably doesn't do what you think:

      cudaMemset(a, 1, n*sizeof(float));
      

      I imagine you think this will fill a float array with the quantity 1, but it won't. cudaMemset, like memset, fills bytes and takes a byte quantity. If you use it to fill a float array, you are effectively creating an array filled with the bit pattern 0x01010101, which interpreted as a float is roughly 2.4e-38, nowhere near a float value of 1. We'll fix this by filling an ordinary host array with a loop, and then transferring that data to the device to be reduced.
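
      (If you'd rather skip the host-side staging array entirely, a trivial grid-stride init kernel is another option; a sketch, with initKernel as an illustrative name:)

      __global__ void initKernel(float *p, float val, size_t N)
      {
        for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
          p[i] = val; // every element gets the requested float value
      }

      // then, instead of the host loop + cudaMemcpy:
      initKernel<<<blocks, 512>>>(a, 1.0f, n);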

    Here's a modified code that has the above issues addressed, and runs correctly for me:

    $ cat t1290.cu
    #include <iostream>
    #include <stdio.h>
    #include <math.h>
    
    __inline__ __device__
    float warpReduceSum(float val) {
      for (int offset = warpSize/2; offset > 0; offset /= 2)
        val += __shfl_down(val, offset);
      return val;
    }
    
    __inline__ __device__
    float blockReduceSum(float val) {
    
      static __shared__ float shared[32]; // Shared mem for 32 partial sums (float, not int, so partial sums aren't truncated)
      int lane = threadIdx.x % warpSize;
      int wid = threadIdx.x / warpSize;
    
      val = warpReduceSum(val);     // Each warp performs partial reduction
    
      if (lane==0) shared[wid]=val; // Write reduced value to shared memory
    
      __syncthreads();              // Wait for all partial reductions
    
      //read from shared memory only if that warp existed
      val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    
      if (wid==0) val = warpReduceSum(val); //Final reduce within first warp
    
      return val;
    }
    
    __global__ void deviceReduceKernel(float *in, float* out, size_t N)
    {
      float sum = 0;
      //reduce multiple elements per thread
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
      {
        sum += in[i];
      }
      sum = blockReduceSum(sum);
      if (threadIdx.x==0)
        out[blockIdx.x]=sum;
    }
    
    int main()
    {
            int n = 1000000;
            float b;
            float *a, *a_host;
            a_host = new float[n];
    
            int blocks = (n/512)+1;
            float *d_intermediate;
    
            cudaMalloc((void**)&d_intermediate, blocks*sizeof(float));
            cudaMalloc((void**)&a, n*sizeof(float));
            for (int i = 0; i < n; i++) a_host[i] = 1;
            cudaMemcpy(a, a_host, n*sizeof(float), cudaMemcpyHostToDevice);
    
            deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
            deviceReduceKernel<<<1, 1024>>>(d_intermediate, a, blocks);
            cudaMemcpy(&b, a, sizeof(float), cudaMemcpyDeviceToHost);
            cudaFree(d_intermediate);
            std::cout << b << std::endl;
            return 0;
    }
    $ nvcc -arch=sm_35 -o t1290 t1290.cu
    $ cuda-memcheck ./t1290
    ========= CUDA-MEMCHECK
    1e+06
    ========= ERROR SUMMARY: 0 errors
    $
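
    One final note: this code uses the pre-CUDA-9 warp shuffle intrinsic __shfl_down, which is deprecated as of CUDA 9 in favor of __shfl_down_sync, which takes an explicit participation mask. A sketch of the same warp reduction for CUDA 9 and later (assuming all 32 lanes of the warp participate):

    __inline__ __device__
    float warpReduceSum(float val) {
      for (int offset = warpSize/2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset); // 0xffffffff = full-warp mask
      return val;
    }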