cuda, reduce, thrust, argmax, gpu-atomics

CUDA, how to find the first item in an array that makes a function maximal


In CUDA C++, I have a big array Arr of integers, and a function F: int -> int. I want to find the first index i such that F(Arr[i]) is maximal.

How can I write a kernel that always keeps the maximal value (in F) to compare with others using atomic stuff to avoid facing any race condition problems?

BTW, I wonder if I can use the functions in the Thrust library for this purpose instead.


Solution

  • How can I write a kernel that always keeps the maximal value (in F) to compare with others using atomic stuff to avoid facing the race condition problems?

    Based on your description, including usage of int, and a desire to use atomics, I would suggest using a custom atomic. This should work for arrays up to 4 billion elements:

    $ cat t2154.cu
    #include <iostream>
    
    // Extract the low 32 bits of the packed (index<<32 | f-value) word as the
    // signed f-value.  Shift/mask instead of reinterpret_cast: the pointer cast
    // violated strict aliasing and assumed little-endian layout, while the
    // packing itself is arithmetic, so the arithmetic inverse is always correct.
    __device__ __host__ int get_int(unsigned long long val){return (int)(unsigned)(val & 0xFFFFFFFFULL);}
    // Extract the high 32 bits of the packed (index<<32 | f-value) word as the
    // unsigned index.  Shift instead of reinterpret_cast for the same reasons
    // as get_int: no strict-aliasing UB, no endianness assumption.
    __device__ __host__ unsigned get_uns(unsigned long long val){return (unsigned)(val >> 32);}
    // Ordering predicate for (f-value, index) pairs: returns true when the
    // stored pair (fval, idx) beats the challenger (fval1, idx1) -- strictly
    // larger f-value, or equal f-value with an index that is not larger
    // (so the first occurrence of the maximum wins ties).
    __device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
      return (fval > fval1) || ((fval == fval1) && (idx <= idx1));
    }
    // Atomically fold (fval, idx) into *addr, a packed 64-bit word holding the
    // index in the high 32 bits and the f-value in the low 32 bits, so that
    // *addr always ends up with the largest f-value (smallest index on ties).
    // Standard atomicCAS retry loop per the CUDA programming guide; returns the
    // last value observed at *addr.
    __device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){

      // Cast fval through unsigned before widening: converting a negative int
      // directly to unsigned long long sign-extends and would overwrite the
      // index field with all-ones (the original code had this bug).
      unsigned long long packed = (((unsigned long long)idx)<<32) | (unsigned long long)(unsigned)fval;
      unsigned long long old = *addr;
      while (!done(get_int(old), fval, get_uns(old), idx))
        old = atomicCAS(addr, old, packed);
      return old;
    }
    const int minidx = 256;   // location (and value) of the peak of the demo function f
    // Demo scoring function: an inverted parabola whose maximum value (minidx)
    // occurs at t == minidx.
    __device__ int f(int t){
      int d = t - minidx;
      return minidx - d*d;
    }
    
    // One thread per element: evaluate f on its element and fold the pair
    // (f(arr[i]), i) into the global running maximum via the custom atomic.
    // Expects a 1D launch with at least N total threads; *min must be
    // pre-initialised to the packed identity (maximum index, minimum int).
    __global__ void k(int *arr, unsigned long long *min, unsigned N){

      unsigned gid = blockIdx.x*blockDim.x + threadIdx.x;
      if (gid >= N) return;   // guard for the grid tail
      my_custom_atomic(min, f(arr[gid]), gid);
    }
    const unsigned my_N = 32768;   // demo problem size

    // Demo driver: fills arr with 0..my_N-1 in managed memory, runs the atomic
    // argmax kernel, and prints the maximal f-value and its first index.
    int main(){

      unsigned long long *min;
      cudaMallocManaged(&min, sizeof(min[0]));
      int *arr;
      cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
      for (unsigned i = 0; i < my_N; i++) arr[i] = i;
      *min = 0xFFFFFFFF80000000ULL; //identity: maximum unsigned index combined with minimum int value
      // Ceil-div launch so the code stays correct when my_N is not an exact
      // multiple of the block size (the kernel bounds-checks the tail).
      k<<<(my_N+255)/256, 256>>>(arr, min, my_N);
      // cudaDeviceSynchronize also surfaces launch and execution errors.
      cudaError_t err = cudaDeviceSynchronize();
      if (err != cudaSuccess) { std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl; return 1; }
      std::cout <<  " maximum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
      // Release the managed allocations (leaked in the original).
      cudaFree(arr);
      cudaFree(min);
    }
    $ nvcc -o t2154 t2154.cu
    $ compute-sanitizer ./t2154
    ========= COMPUTE-SANITIZER
     maximum val: 256 at index: 256
    ========= ERROR SUMMARY: 0 errors
    $
    

    We are assembling and disassembling the 64-bit quantity as needed, and using the general method outlined in the programming guide for arbitrary atomics.

    I wonder if I can use the functions in the Thrust library for this purpose instead.

    Yes, you can do this with a transform and a reduce operation in thrust. In fact thrust can combine these into a single algorithm call. Here is an example:

    $ cat t2155.cu
    #include <iostream>
    #include <thrust/transform.h>
    #include <thrust/reduce.h>
    #include <thrust/device_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/sequence.h>
    #include <limits>
    
    const int minidx = 256;   // location (and value) of the peak of the scoring functor f
    
    const size_t my_N = 32768;   // demo problem size
    
    // Transform functor: maps a zipped (array value, index) tuple to
    // (f(value), index), leaving the index component untouched.
    // operator() is const-qualified because Thrust requires algorithm functors
    // to be callable on const instances (newer Thrust rejects non-const ones).
    struct f
    {
      template <typename T>
      __host__ __device__ T operator()(T t) const {
        T result = t;
        thrust::get<0>(result) = minidx - (thrust::get<0>(t) - minidx)*(thrust::get<0>(t) - minidx);
        return result;
      }
    };
    
    // Reduction functor: of two (f-value, index) tuples keep the one with the
    // larger f-value; on a tie, the one with the smaller index (first
    // occurrence).  Parameters are const references and operator() is const:
    // Thrust's reduction passes temporaries, which cannot bind to non-const
    // references, and requires const-callable binary operators.
    struct r
    {
      template <typename T1, typename T2>
      __host__ __device__ T1 operator()(const T1 &t1, const T2 &t2) const {
      if (thrust::get<0>(t1) > thrust::get<0>(t2)) return t1;
      if (thrust::get<0>(t1) < thrust::get<0>(t2)) return t2;
      if (thrust::get<1>(t1) < thrust::get<1>(t2)) return t1;
      return t2;
      }
    };
    
    // Demo driver: builds arr = [0, 1, ..., my_N-1] on the device and finds the
    // maximal f-value and its first index with a single fused transform+reduce.
    int main(){
    
      thrust::device_vector<int> arr(my_N);
      thrust::sequence(arr.begin(), arr.end());
      // Zip each element with its position: dereferencing yields (value, index).
      auto my_zip = thrust::make_zip_iterator(thrust::make_tuple(arr.begin(), thrust::counting_iterator<size_t>(0)));
      // Reduction identity: smallest possible f-value paired with the largest
      // possible index, so any real element wins against it under r.
      auto init = thrust::make_tuple(std::numeric_limits<int>::min(), std::numeric_limits<size_t>::max());
      // f maps (value, index) -> (f(value), index); r keeps the max-f, min-index tuple.
      auto result = thrust::transform_reduce(my_zip, my_zip+my_N, f(), init, r());
      std::cout <<  " maximum val: " << thrust::get<0>(result) << " at index: " << thrust::get<1>(result) << std::endl;
    }
    $ nvcc -o t2155 t2155.cu
    $ compute-sanitizer ./t2155
    ========= COMPUTE-SANITIZER
     maximum val: 256 at index: 256
    ========= ERROR SUMMARY: 0 errors
    $
    

    Notes:

    EDIT: As suggested in the comments below, we may be able to do a better job in the atomic case, by doing a threadblock level shared sweep reduction, followed by a single atomic per threadblock. Here is an example of that:

    #include <iostream>
    const int nTPB=512;   // threads per block; must be a power of two for the tree reduction
    const unsigned long long initval = 0xFFFFFFFF80000000ULL; // reduction identity: maximum index (high 32 bits) and minimum int (low 32 bits)
    
    // Extract the low 32 bits of the packed (index<<32 | f-value) word as the
    // signed f-value.  Shift/mask rather than reinterpret_cast: the pointer
    // cast violated strict aliasing and assumed little-endian byte order.
    __device__ __host__ int get_int(unsigned long long val){return (int)(unsigned)(val & 0xFFFFFFFFULL);}
    
    // Extract the high 32 bits of the packed (index<<32 | f-value) word as the
    // unsigned index, via a shift instead of an aliasing, endian-dependent cast.
    __device__ __host__ unsigned get_uns(unsigned long long val){return (unsigned)(val >> 32);}
    
    // True when the incumbent pair (fval, idx) wins against the challenger
    // (fval1, idx1): its f-value is strictly larger, or the f-values tie and
    // its index is not larger (first occurrence of the maximum is kept).
    __device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
      if (fval < fval1) return false;
      if (fval > fval1) return true;
      return idx <= idx1;   // equal f-values: smaller index wins
    }
    
    // Atomically fold (fval, idx) into the packed 64-bit word at *addr (index
    // high, f-value low) so it keeps the largest f-value, smallest index on
    // ties.  atomicCAS retry loop per the CUDA programming guide; returns the
    // last value observed.
    __device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){

      // Cast fval through unsigned before widening: converting a negative int
      // straight to unsigned long long sign-extends and would clobber the
      // index bits (a bug in the original packing expression).
      unsigned long long packed = (((unsigned long long)idx)<<32) | (unsigned long long)(unsigned)fval;
      unsigned long long old = *addr;
      while (!done(get_int(old), fval, get_uns(old), idx))
        old = atomicCAS(addr, old, packed);
      return old;
    }
    const int minidx = 256;   // location (and value) of the peak of the demo function f

    // Demo scoring function: inverted parabola with its maximum (minidx) at
    // t == minidx.
    __device__ int f(int t){
      int d = t - minidx;
      return minidx - d*d;
    }
    
    // Combine two packed (index<<32 | f-value) words under the same ordering
    // the atomic uses: larger f-value wins, smaller index breaks ties.
    __device__ unsigned long long my_reduce(unsigned long long t1, unsigned long long t2){
      return done(get_int(t1), get_int(t2), get_uns(t1), get_uns(t2)) ? t1 : t2;
    }
    // Block-sweep argmax kernel: each thread folds a grid-stride slice of arr
    // into its shared-memory slot, the block tree-reduces the slots, then one
    // thread per block publishes the block result with a single atomic.
    // Requires blockDim.x == nTPB and nTPB a power of two.
    __global__ void k(int *arr, unsigned long long *min, unsigned N){

      __shared__ unsigned long long smem[nTPB];
      smem[threadIdx.x] = initval;
      // Grid-stride accumulation.  Cast f's (possibly negative) int result
      // through unsigned before widening: direct conversion sign-extends and
      // would overwrite the index stored in the high 32 bits (bug in the
      // original packing expression).
      for (unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x; idx < N; idx+=gridDim.x*blockDim.x)
          smem[threadIdx.x] = my_reduce(smem[threadIdx.x], (((unsigned long long)idx)<<32) | (unsigned long long)(unsigned)f(arr[idx]));
      // Shared-memory tree reduction; the barrier is outside the divergent
      // branch so all threads reach it.
      for (int t = nTPB>>1; t > 0; t>>=1){
          __syncthreads();
          if (threadIdx.x < t) smem[threadIdx.x] = my_reduce(smem[threadIdx.x], smem[threadIdx.x+t]);}
      // One atomic per block instead of one per element.
      if (!threadIdx.x) my_custom_atomic(min, get_int(smem[0]), get_uns(smem[0]));
    }
    
    const unsigned my_N = 32768;   // demo problem size

    // Demo driver: arr = [0..my_N-1] with a deliberate duplicate of the
    // maximum planted at index 1024, so the tie-break (first index wins) is
    // exercised: the expected answer is value 256 at index 256.
    int main(){

      unsigned long long *min;
      cudaMallocManaged(&min, sizeof(min[0]));
      int *arr;
      cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
      for (unsigned i = 0; i < my_N; i++) arr[i] = i;
      arr[1024] = minidx;   // duplicate maximum: f(arr[1024]) == f(arr[256])
      *min = initval;
      k<<<(my_N+nTPB-1)/nTPB, nTPB>>>(arr, min, my_N);
      // cudaDeviceSynchronize also surfaces launch and execution errors.
      cudaError_t err = cudaDeviceSynchronize();
      if (err != cudaSuccess) { std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl; return 1; }
      // The kernel computes an argMAX; the original label wrongly said "minimum val".
      std::cout <<  " maximum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
      // Release the managed allocations (leaked in the original).
      cudaFree(arr);
      cudaFree(min);
    }