Using Thrust to find maximum values, I get error what(): extrema failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered
when I put large array whose length is close to INT_MAX (2147483647).
I found a similar post in https://github.com/NVIDIA/thrust/issues/1271, and it seems most of the bugs are 'closed'.
Maybe max_element is the one remaining?
Would there be any workaround with Thurst? While I can write my own CUDA kernels to find maximum, I bet it will underperform Thrust.
Below is the test code, and when I keep first 3 digits of INT_MAX (i.e. 2140000000), it works. However, if I keep one additional digit as 2147000000, it fails with the error above.
#include <iostream>
#include <cstdlib>
#include "thrust/device_ptr.h"
#include "thrust/device_vector.h"
#include "thrust/fill.h"
#include "thrust/copy.h"
#include "thrust/extrema.h"
int main() {
size_t N = std::numeric_limits<int>::max(); // 2147483647
N = 2147000000;
std::cout << "N " << N << std::endl;
thrust::device_vector<int> dArr(N);
thrust::fill(dArr.begin(), dArr.end(), 1);
dArr[N-2] = 2;
thrust::device_vector<int>::iterator iter=thrust::max_element(dArr.begin(),dArr.end());
size_t position = iter - dArr.begin();
int val = dArr[position];
std::cout << "max value in dArr " << val << std::endl;
std::cout << " - max_index " << position << std::endl;
return 0;
}
Here are two possible work-arounds. 1. Use thrust::reduce with a custom comparator. 2. Use cub's version of max_element. Note that cub's public API currently only supports up to int_max elements. thrust::reduce should use the internal cub implementation with the correct index type depending on the size of the input range.
// nvcc --extended-lambda -arch=sm_80 -std=c++17
#include <iostream>
#include <cstdlib>
#include "thrust/device_ptr.h"
#include "thrust/device_vector.h"
#include "thrust/fill.h"
#include "thrust/copy.h"
#include "thrust/extrema.h"
#include <cub/cub.cuh>
int main() {
size_t N = std::numeric_limits<int>::max(); // 2147483647
N = 2147000000;
std::cout << "N " << N << std::endl;
thrust::device_vector<int> dArr(N);
thrust::fill(dArr.begin(), dArr.end(), 1);
dArr[N-2] = 2;
// thrust::device_vector<int>::iterator iter=thrust::max_element(dArr.begin(),dArr.end());
// size_t position = iter - dArr.begin();
// int val = dArr[position];
// std::cout << "max value in dArr " << val << std::endl;
// std::cout << " - max_index " << position << std::endl;
auto result1 = thrust::reduce(
thrust::make_zip_iterator(
dArr.begin(),
thrust::make_counting_iterator<int>(0)
),
thrust::make_zip_iterator(
dArr.begin(),
thrust::make_counting_iterator<int>(0)
) + N,
thrust::make_tuple(std::numeric_limits<int>::min(), -1),
[] __host__ __device__ (const thrust::tuple<int,int>& l, const thrust::tuple<int,int>& r){
if(thrust::get<0>(l) > thrust::get<0>(r)) return l;
else return r;
}
);
std::cout << thrust::get<0>(result1) << " " << thrust::get<1>(result1) << "\n";
using T = int;
size_t temp_storage_bytes = 0;
thrust::device_vector<cub::KeyValuePair<int, T>> dResult2(1);
cub::DeviceReduce::ArgMax(
nullptr,
temp_storage_bytes,
dArr.data().get(),
dResult2.data().get(),
N
);
thrust::device_vector<char> dTemp(temp_storage_bytes);
cub::DeviceReduce::ArgMax(
dTemp.data().get(),
temp_storage_bytes,
dArr.data().get(),
dResult2.data().get(),
N
);
cudaDeviceSynchronize();
cub::KeyValuePair<int, T> result2 = dResult2[0];
std::cout << result2.value << " " << result2.key << "\n";
return 0;
}