In my code I use arrays with complex numbers from thrust library and I would like to use cublasZgeam() in order to transpose the array.
Using complex numbers from cuComplex.h is not a preferable option since I do a lot of arithmetic on the array and cuComplex doesnt have defined operators such as * +=.
This is how I defined array which I want to transpose
thrust::complex<float> u[xmax][xmax];
I have found this https://github.com/jtravs/cuda_complex, but using it as such:
#include "cuComplex.hpp"
doesnt allow me to use mentioned operators when compiled with nvcc
error: no operator "+=" matches these operands
operand types are: cuComplex += cuComplex
Is there some solution to this? Code from github is old and there may lay the issue or maybe I am using it wrong
EDIT: Here is code which works, only difference from talonmies code is adding simple kernel and pointer to same data but being thrust::complex
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>
using namespace std;
__global__ void test(thrust::complex<double>* u) {
u[0] += thrust::complex<double>(3.3,3.3);
}
int main()
{
int xmax = 100;
thrust::complex<double> u[xmax][xmax];
double arrSize = sizeof(thrust::complex<double>) * xmax * xmax;
thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
u[49][51] += thrust::complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
cout << u[0][0] << endl;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
thrust::complex<double>* d_vTest = reinterpret_cast<thrust::complex<double>* >(d_v);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
test<<<1,1>>>(d_vTest);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[0][0] << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
Despite your protestations to the contrary, the C++ standard library complex
(or thrust::complex
) most certainly does work with CUBLAS. The cuComplex
and cuDoubleComplex
are design to be binary compatible with standard host complex types so that data does not be translated when passed to CUBLAS functions which use complex data on the device.
A simple modification to the code you posted in comments works exactly as you might imagine:
#include <algorithm>
#include <iostream>
#include <complex>
#include <cublas_v2.h>
using namespace std;
int main()
{
int xmax = 100;
complex<double> u[xmax][xmax];
size_t arrSize = sizeof(complex<double>) * xmax * xmax;
fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0,1.0));
u[49][51] += complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
complex<double> alpha(1.0, 0.0);
complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
built and run like so:
~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose
Before:
(666,666)
(2,2)
After:
(2,2)
(666,666)
The only modifications required are explicit casts of the std::complex<double>
types to cuDoubleComplex
. Do that and everything works as expected.
Use thrust, the code looks almost identical:
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>
using namespace std;
int main()
{
int xmax = 100;
thrust::complex<double> u[xmax][xmax];
size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax;
thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
u[49][51] += thrust::complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
Perhaps something closer to your use case, using thrust device containers with a kernel performing some initialisation prior to a CUBLAS call:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/copy.h>
#include <cublas_v2.h>
__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0);
u[49 + 51*xmax] *= 2.0;
}
int main()
{
int xmax = 100;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0));
thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.));
setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);
cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, _d_u, xmax,
_beta, _d_u, xmax,
_d_v, xmax);
thrust::complex<double> u[xmax][xmax];
thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
std::cout << "Before:" << std::endl;
std::cout << u[49][51] << std::endl;
std::cout << u[51][49] << std::endl;
thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
std::cout << "After:" << std::endl;
std::cout << u[49][51] << std::endl;
std::cout << u[51][49] << std::endl;
return 0;
}