I'm working on a C++ Windows project that involves FFT calculations on devices using the CUDA 10.1 library.
I have a vector of vectors std::vector<std::vector<std::complex<float>>> unitedVector
and I need to copy its contents to device memory to perform FFT on the GPU.
I have a std::complex<float>* gpu_data pointer that points to device memory allocated with cudaMalloc.
To initialize the cuFFT plan I use the following code:
cufftHandle plan;
int data_size = unitedVector[0].size(); // Size of data in each vector
int num_channels = unitedVector.size(); // Number of vectors (or channels with data)
std::complex<float>* gpu_data;
cufftPlan1d(&plan, data_size, CUFFT_C2C, num_channels); // Create FFT plan
cudaMalloc((void**)&gpu_data, num_channels * data_size * sizeof(std::complex<float>)); // Allocation of memory on the GPU
After the plan is initialized, I copy the data like this:
cudaMemcpy(gpu_data, unitedVector[0].data(), num_channels * data_size * sizeof(std::complex<float>), cudaMemcpyHostToDevice);
Next, I perform FFT as follows:
cufftResult result = cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(gpu_data), reinterpret_cast<cufftComplex*>(gpu_data), CUFFT_FORWARD);
// Rotate the spectrum (fftshift)
int half_size = data_size / 2;
for (int i = 0; i < num_channels; ++i) {
cudaMemcpy(unitedVector[i].data(), gpu_data + i * data_size + half_size, half_size * sizeof(std::complex<float>), cudaMemcpyDeviceToHost);
cudaMemcpy(unitedVector[i].data() + half_size, gpu_data + i * data_size, half_size * sizeof(std::complex<float>), cudaMemcpyDeviceToHost);
}
// unitedVector now contains the FFT results for each vector
My FFT implementation runs on the device and uses the cuFFT library for this purpose.
Question: How can I copy data from unitedVector to gpu_data correctly, and how can I do FFT for each vector on the device using CUDA? Could you provide example code for this task?
I used the code above to copy the data and run the FFT, but the results are unreliable: the first 3-4 times the FFT is calculated correctly (matching Matlab), and then an exception occurs. I suspect the problem is either a bad copy of the data or my FFT implementation. How can I fix this?
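Try looping over the inner vectors, doing one cudaMemcpy per block of memory. Each inner std::vector owns its own separately allocated buffer, so a single copy starting at unitedVector[0].data() reads past the end of the first vector: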
// Upload the channels one by one: channel i is copied from its own host vector
// into the device buffer at element offset i * data_size.
// Assumes every unitedVector[i] holds at least data_size elements — TODO confirm.
// The byte count is kept in size_t so large channels cannot overflow an int.
size_t n_bytes = static_cast<size_t>(data_size) * sizeof(std::complex<float>);
cudaError_t status = cudaSuccess;
for (int i = 0; i < num_channels; ++i) {
    auto to = gpu_data + static_cast<size_t>(i) * data_size;
    auto from = unitedVector[i].data();
    status = cudaMemcpy(to, from, n_bytes, cudaMemcpyHostToDevice);
    if (status != cudaSuccess) {
        // Report which channel failed and why, then stop: the device buffer is
        // incomplete and further copies would only repeat the error.
        std::cout << "CudaMemcpy failed!" << " channel " << i << ": " << cudaGetErrorString(status) << std::endl;
        break;
    }
}