cudathrust

CUDA thrust max_element fails with large index


Using Thrust to find maximum values, I get error what(): extrema failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered when I put large array whose length is close to INT_MAX (2147483647).

I found a similar post in https://github.com/NVIDIA/thrust/issues/1271, and it seems most of the bugs are 'closed'.

Maybe max_element is the one remaining?

Would there be any workaround with Thurst? While I can write my own CUDA kernels to find maximum, I bet it will underperform Thrust.

Below is the test code, and when I keep first 3 digits of INT_MAX (i.e. 2140000000), it works. However, if I keep one additional digit as 2147000000, it fails with the error above.

#include <iostream>
#include <cstdlib>
#include "thrust/device_ptr.h"
#include "thrust/device_vector.h"
#include "thrust/fill.h"
#include "thrust/copy.h"
#include "thrust/extrema.h"

int main() {
  size_t N = std::numeric_limits<int>::max(); // 2147483647
  N = 2147000000;

  std::cout << "N  " << N << std::endl;

  thrust::device_vector<int> dArr(N);
  thrust::fill(dArr.begin(), dArr.end(), 1);

  dArr[N-2] = 2;

  thrust::device_vector<int>::iterator iter=thrust::max_element(dArr.begin(),dArr.end());

  size_t position = iter - dArr.begin();
  int    val = dArr[position];

  std::cout << "max value in dArr  " << val << std::endl;
  std::cout << " - max_index  " << position << std::endl;

  return 0;
}

Solution

  • Here are two possible work-arounds. 1. Use thrust::reduce with a custom comparator. 2. Use cub's version of max_element. Note that cub's public API currently only supports up to int_max elements. thrust::reduce should use the internal cub implementation with the correct index type depending on the size of the input range.

    // nvcc --extended-lambda -arch=sm_80 -std=c++17
    
    #include <iostream>
    #include <cstdlib>
    #include "thrust/device_ptr.h"
    #include "thrust/device_vector.h"
    #include "thrust/fill.h"
    #include "thrust/copy.h"
    #include "thrust/extrema.h"
    
    #include <cub/cub.cuh>
    
    int main() {
      size_t N = std::numeric_limits<int>::max(); // 2147483647
      N = 2147000000;
    
      std::cout << "N  " << N << std::endl;
    
      thrust::device_vector<int> dArr(N);
      thrust::fill(dArr.begin(), dArr.end(), 1);
    
      dArr[N-2] = 2;
    
      // thrust::device_vector<int>::iterator iter=thrust::max_element(dArr.begin(),dArr.end());
    
      // size_t position = iter - dArr.begin();
      // int    val = dArr[position];
    
      // std::cout << "max value in dArr  " << val << std::endl;
      // std::cout << " - max_index  " << position << std::endl;
    
    
      auto result1 = thrust::reduce(
        thrust::make_zip_iterator(
          dArr.begin(),
          thrust::make_counting_iterator<int>(0)
        ),
        thrust::make_zip_iterator(
          dArr.begin(),
          thrust::make_counting_iterator<int>(0)
        ) + N,
        thrust::make_tuple(std::numeric_limits<int>::min(), -1),
        [] __host__ __device__ (const thrust::tuple<int,int>& l, const thrust::tuple<int,int>& r){
          if(thrust::get<0>(l) > thrust::get<0>(r)) return l;
          else return r;
        }
      );
    
      std::cout << thrust::get<0>(result1) << " " << thrust::get<1>(result1) << "\n";
    
      using T = int;
      size_t temp_storage_bytes = 0;
      thrust::device_vector<cub::KeyValuePair<int, T>> dResult2(1);
      cub::DeviceReduce::ArgMax(
        nullptr,
        temp_storage_bytes,
        dArr.data().get(),
        dResult2.data().get(),
        N
      );
    
      thrust::device_vector<char> dTemp(temp_storage_bytes);
    
      cub::DeviceReduce::ArgMax(
        dTemp.data().get(),
        temp_storage_bytes,
        dArr.data().get(),
        dResult2.data().get(),
        N
      );
      cudaDeviceSynchronize();
      cub::KeyValuePair<int, T> result2 = dResult2[0];
      std::cout << result2.value << " " << result2.key << "\n";
    
    
      return 0;
    }