CUDA array reduction optimisation

I have two arrays x (of size N ~1-100 millions) and a (greatly smaller Na ~1000-10000) and I want to use x to define a as

for(int j = 0; j < N; j++) {
  float  i = floor( x[j] / da); // in principle i < size(a)

  a[(int)i] += 0.5;
  a[(int)i+1] += 0.5; // I simplify the problem
}

For the context x are the particle positions and a are the number of particles per cell.

I want to execute this function in CUDA. The main issue is I can have several modifications of the same memory at the same time since xis not sorted.

I found the following solution, but I find it very slow. I define a temporary array d_temp_a of size Na * number of threads used. Then, I reduce it to my full array.

Here is the code (use nvcc -std=c++11 example_reduce.cu -o example_reduce.out)

#include "stdio.h"
#include <cuda.h>
#include <random>
using namespace std;


__global__ void getA(float *d_x, float *d_a, float *d_temp_a, int N, int Na, float da)
{
// Get our global thread ID
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;

  float ix ;

  // Compute a
  for(int x = index; x < N; x += stride) {
      ix = floor( d_x[x] / da );

      d_temp_a[((int)ix) + Na * index] += 0.5;
      d_temp_a[((int)ix + 1) + Na * index] += 0.5;
  }
  __syncthreads();


  // Reduce
  for(int l = index; l < Na; l += stride) {
      for(int m = 0; m < stride; m += 1) {
          d_a[l] += d_temp_a[l + Na * m];
      }
  }
  __syncthreads();
}


int main(int argc, char **argv)
{

  int N = 1000000;   
  int Na = 4096;   

  float L = 50; // box size
  float dxMesh = L / Na; // cell size

  float *h_x, *h_a;  // host data

  h_x = (float *)malloc(N * sizeof(float));
  h_a = (float *)malloc(Na * sizeof(float));

  /* Initialize random seed: */
  std::default_random_engine generator;
  std::uniform_real_distribution<float> generate_unif_dist(0.0,1.0);

  // h_x random initialisation
  for(int x = 0; x < N; x++) {
        float random = generate_unif_dist(generator);
        h_x[x] = random * L;
    }

 
  int blockSize = 512; // Number of threads in each thread block
  int gridSize = (int)ceil((float) N /blockSize); // Number of thread blocks in grid

  float *d_x, *d_a;  // device data

  cudaMalloc((void **) &d_x, N * sizeof(float));
  cudaMalloc((void **) &d_a, Na * sizeof(float));

  cudaMemcpy(d_x, h_x, N * sizeof(float), cudaMemcpyHostToDevice);

  // Create temp d_a array
  float *d_temp_a;
  cudaMalloc((void **) &d_temp_a, Na * blockSize * gridSize * sizeof(float));

  getA<<<gridSize,blockSize>>>(d_x, d_a, d_temp_a, N, Na, da);

  cudaMemcpy(h_a, d_a, Na * sizeof(float), cudaMemcpyDeviceToHost);

  free(h_x);
  free(h_a);

  cudaFree(d_x);
  cudaFree(d_a);
  cudaFree(d_temp_a);

  return 0;
}

It is slow because I only use 1 thread for every element of my array. My question: Is there a way to optimize this reduction? I also found inefficient to have this extremely large array of size Na * number of threads. Is there a way to avoid using it ?

Note that, I intend to write later a 2D version with x and y defining a[i][j].

Solution

I think your approach might be a bit of an overkill for this problem.

Like the other people in the comments, I also thought that you could implement your idea of reduction with thrust. However, my approach involved counting up the appearances of each idx and then inserting these counts (see Counting occurrences of numbers in a CUDA array)

Thrust reduction approach

So i implemented this almost exclusively with thrust methods (fill, transform, sort, reduce_by_key) and ran a final kernel over the end result to split the values between the two neighbouring cells. This worked and was much faster than your CUDA approach, but it was still much slower than the simple CPU implementation. The big problem was the sorting and reduce_by_key of the N values.

struct custom_functor{
    float factor;
    custom_functor(float _factor){
      factor = _factor;
    }
    __host__ __device__ int operator()(float &x) const {
        return (int) floor(x / factor);
    }
};

__global__ void thrust_reduce_kernel(float *d_a, int* d_a_idxs, int* d_a_cnts, int N, int Na, int n_entries)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= n_entries)
    return;

  int a_idx = d_a_idxs[index];
  int a_cnt = d_a_cnts[index];

  if ((a_idx + 1) >= Na || a_idx < 0 || a_idx >= Na || (a_idx + 1) < 0)
  {
    printf("Should not happen according to you!\n");
    return;
  }

  atomicAdd(&d_a[a_idx], a_cnt * 0.5f);
  atomicAdd(&d_a[a_idx+1], a_cnt * 0.5f);
}

void test_thrust_reduce(float *d_x, float *d_a, float *h_a, int N, int Na, float da)
{
  int *d_xi, *d_ones;
  int *d_a_cnt_keys, *d_a_cnt_vals;

  cudaMalloc((void**) &d_xi, N * sizeof(int));
  cudaMalloc((void**) &d_ones, N * sizeof(float));

  cudaMalloc((void**) &d_a_cnt_keys, Na * sizeof(int));
  cudaMalloc((void**) &d_a_cnt_vals, Na * sizeof(int));
  CUDA_CHECK;

  thrust::device_ptr<float> dt_x(d_x);
  thrust::device_ptr<float> dt_a(d_a);
  thrust::device_ptr<int> dt_xi(d_xi);
  thrust::device_ptr<int> dt_ones(d_ones);
  thrust::device_ptr<int> dt_a_cnt_keys(d_a_cnt_keys);
  thrust::device_ptr<int> dt_a_cnt_vals(d_a_cnt_vals);

  custom_functor f(da);
  thrust::fill(thrust::device, dt_a, dt_a + Na, 0.0f);
  thrust::fill(thrust::device, dt_ones, dt_ones + N, 1);
  thrust::fill(thrust::device, dt_a_cnt_keys, dt_a_cnt_keys + Na, -1);
  thrust::fill(thrust::device, dt_a_cnt_vals, dt_a_cnt_vals + Na, 0);

  thrust::transform(thrust::device, dt_x, dt_x + N, dt_xi, f);
  thrust::sort(thrust::device, dt_xi, dt_xi + N);

  thrust::pair<thrust::device_ptr<int>,thrust::device_ptr<int>> new_end;
  new_end = thrust::reduce_by_key(thrust::device, dt_xi, dt_xi + N, dt_ones, 
                                  dt_a_cnt_keys, dt_a_cnt_vals);

  int n_entries = new_end.first - dt_a_cnt_keys;
  int n_entries_2 = new_end.first - dt_a_cnt_keys;

  dim3 dimBlock(256);
  dim3 dimGrid((n_entries + dimBlock.x - 1) / dimBlock.x);
  thrust_reduce_kernel<<<dimGrid, dimBlock>>>(d_a, d_a_cnt_keys, d_a_cnt_vals, N, Na, n_entries);
  cudaMemcpy(h_a, d_a, Na * sizeof(float), cudaMemcpyDeviceToHost);

  cudaFree(d_xi);
  cudaFree(d_ones);
  cudaFree(d_a_cnt_keys);
  cudaFree(d_a_cnt_vals);
}

Simple atomicAdd approach

So i was curious if you could just use a simple atomicAdd per entry in d_x and this proved to be the fastest solution of all of them.

__global__ void simple_atomicAdd_kernel(const float *d_x, float *d_a, float da, int N, int Na)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= N)
    return;

  int a_idx = floor(d_x[index] / da); // in principle i < size(a)

  atomicAdd(&d_a[a_idx], 0.5f);
  atomicAdd(&d_a[a_idx+1], 0.5f);
} 

void test_simple_atomicAdd(float *d_x, float *d_a, float *h_a, int N, int Na, float da)
{
  cudaMemset(d_a, 0, Na * sizeof(float));

  dim3 dimBlock(256);
  dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x);
  simple_atomicAdd_kernel<<<dimGrid, dimBlock>>>(d_x, d_a, da, N, Na);
  cudaMemcpy(h_a, d_a, Na * sizeof(float), cudaMemcpyDeviceToHost);
}

Results

You can see my times for N=100,000 and da=0.1 below. Your initial value of N = 1,000,000 resulted in an out_of_memory exception for me. All

Times: 
- CPU Reference:         912 us
- CUDA Custom reduce:    34275 us
- CUDA Thrust reduce:    2144 us
- CUDA Simple atomicAdd: 59 us

Looking at higher values for N, the Thrust reduce approach starts to get better, because we have much more conflicts in the atomicAdd approach. This is very dependent on your x values and the value of da:

Times (N=1,000,000, da=0.1): 
- CPU Reference:         9398 us
- CUDA Thrust reduce:    1287 us
- CUDA Simple atomicAdd: 409 us

Times (N=10,000,000, da=0.1): 
- CPU Reference:         92068 us
- CUDA Thrust reduce:    3879 us
- CUDA Simple atomicAdd: 3851 us

Times (N=100,000,000, da=0.1): 
- CPU Reference:         918950 us
- CUDA Thrust reduce:    21051 us
- CUDA Simple atomicAdd: 38583 us

Disclaimer: I am far from an expert in CUDA programming and there might be something important that I am missing. These are just my findings and I am certain that there exist approaches that work much better in your case. However, a simple atomicAdd approach might be a fast and easy solution to your problem.

You can checkout the full code here: https://pastebin.com/BZYJ2rYe

I hope this is helpful. Cheers, Michael