I am trying to perform a sum reduction using CUB and 2D arrays of type float/double. Although it works for certain combinations of rows+columns, for relatively larger arrays, I get an illegal memory access error during the last transfer. A minimal example is the following:
#include <stdio.h>
#include <stdlib.h>
#include <cub/device/device_reduce.cuh>
#include "cuda_runtime.h"
#ifdef DP
#define real double
#else
#define real float
#endif
void generatedata(const int num, real* vec, real start, real finish) {
real rrange = finish - start;
for (auto i = 0; i < num; ++i)
vec[i] = rand() / float(RAND_MAX) * rrange + start;
}
real reduce_to_sum(const int num, const real* vec) {
real total = real(0.0);
for (auto i = 0; i < num; ++i)
total += vec[i];
return total;
}
int main() {
int rows = 2001;
int cols = 3145;
size_t msize = rows * cols;
real* data = (real*)malloc(msize * sizeof(real));
if (!data)
return -999;
generatedata(msize, data, 0., 50.);
real ref_sum = reduce_to_sum(msize, data);
real* d_data_in = nullptr;
real* d_data_out = nullptr;
size_t pitch_in, pitch_out;
cudaError_t err = cudaMallocPitch(&d_data_in, &pitch_in, cols * sizeof(real), rows);
if (err != cudaSuccess) {
printf("data_in :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMallocPitch(&d_data_out, &pitch_out, cols * sizeof(real), rows);
if (err != cudaSuccess) {
printf("data_out :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMemset(d_data_in, 0, rows * pitch_in);
if (err != cudaSuccess) {
printf("set data_in :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMemcpy2D(d_data_in, pitch_in, data, cols * sizeof(real), cols * sizeof(real), rows, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
printf("copy data :: %s \n", cudaGetErrorString(err));
return -999;
}
void* d_temp = nullptr;
size_t temp_bytes = 0;
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * pitch_out);
err = cudaMalloc(&d_temp, temp_bytes);
if (err != cudaSuccess) {
printf("temp :: %s \n", cudaGetErrorString(err));
return -999;
}
err = cudaMemset(d_data_out, 0, rows * pitch_out);
if (err != cudaSuccess) {
printf("set temp :: %s \n", cudaGetErrorString(err));
return -999;
}
// Run sum-reduction
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * pitch_out);
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("reduction :: %s \n", cudaGetErrorString(err));
return -999;
}
real gpu_sum = real(0.0);
err = cudaMemcpy(&gpu_sum, d_data_out, sizeof(real), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
printf("copy final :: %s \n", cudaGetErrorString(err));
return -999;
}
printf("Difference in sum (h)%f - (d)%f = %f \n", ref_sum, gpu_sum, ref_sum - gpu_sum);
if (data) free(data);
if (d_data_in) cudaFree(d_data_in);
if (d_data_out) cudaFree(d_data_out);
if (d_temp) cudaFree(d_temp);
cudaDeviceReset();
return 0;
}
The error is thrown at "copy final ::". I am bit confused as to why certain rows x columns work and others don't. I did notice it's the larger values that cause it, but can't get my head around. Any suggestions would be much appreciated.
The 5th parameter of cub::DeviceReduce::Sum should be the number of input elements. However, rows * pitch_out
is the size of the output buffer in bytes.
Assuming pitch_in % sizeof(real) == 0
, the following call may work.
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_data_in, d_data_out, rows * (pitch_in / sizeof(real)));
Also note that cub::DeviceReduce::Sum may return before the reduction is complete. In this case, if any error happened during execution, this error will be reported by cudaMemcpy.