I have a struct that contains an array, and I want to copy the contents from an instance of that struct in CPU memory to another instance in GPU memory.
My question is similar to this one. There are two big difference between this question and the one from the link:
In attempt to answer my own question, I tried modifying the code in the answer as follows:
#include <stdio.h>
#include <stdlib.h>
#define cudaCheckError() { \
cudaError_t err = cudaGetLastError(); \
if(err != cudaSuccess) { \
printf("Cuda error: %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(1); \
} \
}
struct Test {
char array[5];
};
__global__ void kernel(Test *dev_test) {
for(int i=0; i < 5; i++) {
printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
}
}
__device__ Test dev_test; //dev_test is now global, statically allocated, and one instance of the struct
int main(void) {
int size = 5;
Test test; //test is now statically allocated and one instance of the struct
char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
memcpy(test.array, temp, size * sizeof(char));
cudaCheckError();
cudaMemcpy(&dev_test, &test, sizeof(Test), cudaMemcpyHostToDevice);
cudaCheckError();
kernel<<<1, 1>>>(&dev_test);
cudaCheckError();
cudaDeviceSynchronize();
cudaCheckError();
// memory free
return 0;
}
But this code throws a runtime error:
Cuda error: HelloCUDA.cu:34: invalid argument
Is there a way to copy test
into dev_test
?
When using a statically allocated __device__
variable:
We don't use the cudaMemcpy
API. We use the cudaMemcpyToSymbol
(or cudaMemcpyFromSymbol
) API
We don't pass __device__
variables as kernel arguments. They are at global scope. You just use them in your kernel code.
The following code has these issues addressed:
$ cat t10.cu
#include <stdio.h>
#define cudaCheckError() { \
cudaError_t err = cudaGetLastError(); \
if(err != cudaSuccess) { \
printf("Cuda error: %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(1); \
} \
}
struct Test {
char array[5];
};
__device__ Test dev_test; //dev_test is now global, statically allocated, and one instance of the struct
__global__ void kernel() {
for(int i=0; i < 5; i++) {
printf("Kernel[0][i]: %c \n", dev_test.array[i]);
}
}
int main(void) {
int size = 5;
Test test; //test is now statically allocated and one instance of the struct
char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
memcpy(test.array, temp, size * sizeof(char));
cudaCheckError();
cudaMemcpyToSymbol(dev_test, &test, sizeof(Test));
cudaCheckError();
kernel<<<1, 1>>>();
cudaCheckError();
cudaDeviceSynchronize();
cudaCheckError();
// memory free
return 0;
}
$ nvcc -o t10 t10.cu
$ cuda-memcheck ./t10
========= CUDA-MEMCHECK
Kernel[0][i]: a
Kernel[0][i]: b
Kernel[0][i]: c
Kernel[0][i]: d
Kernel[0][i]: e
========= ERROR SUMMARY: 0 errors
$
(your array usage in kernel code also didn't make sense. dev_test
is not an array, therefore you cannot index into it: dev_test[0]....
)