I have modified the int version of vector add to two complex vector to add, below code can work, but I am confused:
#include <stdio.h>
#include <complex>
#define N (2048*2048)
#define THREADS_PER_BLOCK 512
__global__ void add(std::complex<double> *a, std::complex<double> *b, std::complex<double> *c)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
// c[index] = a[index] + b[index];
// c[index] = a[index].real();
c[index] = a[index];
}
int main()
{
// host side
std::complex<double> *a;
std::complex<double> *b;
std::complex<double> *c;
// device side
std::complex<double> *d_a;
std::complex<double> *d_b;
std::complex<double> *d_c;
int size = N * sizeof(std::complex<double>);
/* allocate space for device copies of a, b, c */
cudaMalloc( (void **) &d_a, size );
cudaMalloc( (void **) &d_b, size );
cudaMalloc( (void **) &d_c, size );
/* allocate space for host copies of a, b, c and setup input values */
a = (std::complex<double>*)malloc( size );
b = (std::complex<double>*)malloc( size );
c = (std::complex<double>*)malloc( size );
for( int i = 0; i < N; i++ )
{
a[i] = b[i] = i;
c[i] = 0;
}
cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice );
cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice );
add<<< std::ceil(N / (double)THREADS_PER_BLOCK), THREADS_PER_BLOCK>>>( d_a, d_b, d_c );
cudaDeviceSynchronize();
cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost);
bool success = true;
for( int i = 0; i < N; i++ )
{
// if( c[i] != a[i] + b[i])
if( c[i] != a[i] )
{
printf("c[%d] = %d\n",i,c[i] );
success = false;
break;
}
}
printf("%s\n", success ? "success" : "fail");
free(a);
free(b);
free(c);
cudaFree( d_a );
cudaFree( d_b );
cudaFree( d_c );
return 0;
}
for the kernel function:
__global__ void add(std::complex<double> *a, std::complex<double> *b, std::complex<double> *c)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
// c[index] = a[index] + b[index];
// c[index] = a[index].real();
c[index] = a[index];
}
Line
c[index] = a[index];
will call std::complex operator =, this can pass compile, but when change to compile with line:
c[index] = a[index] + b[index]; // first one
c[index] = a[index].real(); // second one
It will just cannot compile, the error message for the first one is:
complex.cu(10): error: calling a host function("std::operator + ") from a global function("add") is not allowed
complex.cu(10): error: identifier "std::operator + " is undefined in device code
the error message when change to use the second one is like:
complex.cu(11): error: calling a constexpr host function("real") from a global function("add") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
1 error detected in the compilation of "/tmp/tmpxft_000157af_00000000-8_complex.cpp1.ii".
The compile command I used:
/usr/local/cuda-10.2/bin/nvcc -o complex complex.cu
I knew well that device code cannot call host code, and real() and + function for std::complex is both host code, so they cannot be called in kernel function, however, I am not understand why std::complex operator = can pass compile in my kernel function?
update: After overload the operator+ for std::complex, above code can achieve desired result:
__host__ __device__ std::complex<double> operator+(const std::complex<double>& a, const std::complex<double>& b)
{
const double* aArg = reinterpret_cast<const double*>(&a);
const double* bArg = reinterpret_cast<const double*>(&b);
double retVal[2] = { aArg[0] + bArg[0], aArg[1] + bArg[1] };
return std::move(*reinterpret_cast<std::complex<double>*>(retVal));
}
The root cause is that the underline struct of std::complex is in fact a array of 2 data types you defined, like double[2], the benefit is that we can have same function parameter at host/device side. However, I still recommand to use thrust/complex
or other complex library in CUDA.
No, CUDA C++ does not implement std::complex<T>::operator+()
as a built-in.
The std::complex<T>
type is not implemented for the GPU; all of its methods are written host-only. Exceptions to this rule are constexpr
methods, and as @RobertCrovella noted, the compiler willingness to treat some/all implicitly-declared methods as __host__ __device__
- e.g. copy constructors or assignment operators . This is why c[index] = a[index]
works: It uses an implicitly-defined assignment operator.
For using complex numbers on the device side, consider this question: