If I call cudaMemcpy from host memory to host memory, will it first synchronize the device? Is there any difference between the CUDA memcpy call and the ordinary C++ function memcpy? I know that if I want to do a 2D memcpy from host to host, I have to use the CUDA call, since there is no such function in C++. Are there any others?
If I call cudaMemcpy from host memory to host memory, will it first synchronize the device?
I verified that cudaMemcpy()
with cudaMemcpyHostToHost
does synchronize with the following code:
#include <cstdio>
#include <cstdlib>

#include <cuda.h>
#include <cuda_runtime.h>
// Wraps a CUDA runtime call: on failure, reports the error together with the
// call site and terminates the process. The do { } while (0) wrapper makes the
// macro expand to a single statement, so it stays correct inside an unbraced
// if/else.
#define check_cuda_call(ans) do { _check((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr and exits with the error code as the process
// status when `code` is not cudaSuccess; otherwise does nothing.
//   code - status returned by a CUDA runtime call
//   file - source file of the call site (const, because __FILE__ is a string
//          literal)
//   line - source line of the call site
inline void _check(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        std::fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
        std::exit(code);
    }
}
// Cycle count measured by clock_block, stored once its busy-wait finishes.
__device__ clock_t offset;

// Busy-waits on the device until at least `clock_count` clock cycles have
// elapsed, then stores the measured cycle count in `offset`.
//
// Every thread that runs this kernel performs the wait and the store, so it is
// meant to be launched with a single thread.
//
// clock64() is used instead of clock(): clock() reads a 32-bit counter that
// wraps around, and a wrap during the wait would make the elapsed difference
// negative and keep the loop spinning far longer than requested.
__global__ void clock_block(clock_t clock_count)
{
    const long long start_clock = clock64();
    long long clock_offset = 0;
    while (clock_offset < clock_count) {
        clock_offset = clock64() - start_clock;
    }
    // Writing the result to global memory keeps the loop from being optimized away.
    offset = static_cast<clock_t>(clock_offset);
}
// Demonstrates that cudaMemcpy with cudaMemcpyHostToHost synchronizes with the
// device: a long-running kernel is launched asynchronously, then a host-to-host
// copy between two pinned buffers is issued. If the copy blocks, the program
// only exits after the kernel has finished.
int main(int argc, char *argv[])
{
    (void)argc;  // command-line arguments are not used
    (void)argv;

    // Two single-int pinned host buffers: A is the destination, B the source.
    int *A = 0;
    check_cuda_call(cudaMallocHost(&A, 1 * sizeof(int)));
    int *B = 0;
    check_cuda_call(cudaMallocHost(&B, 1 * sizeof(int)));
    *B = 0;  // give the source a defined value before it is copied

    // Asynchronous launch: control returns to the host immediately while the
    // kernel spins for the requested number of clock cycles.
    clock_block<<<1, 1>>>(1000 * 1000 * 1000);
    // A launch does not return a status itself; pick up launch errors here.
    check_cuda_call(cudaGetLastError());

    // Uncomment to block explicitly instead of relying on the copy below.
    //check_cuda_call(cudaDeviceSynchronize());

    // Pass the buffer pointers themselves (A, B), not the addresses of the
    // pointer variables (&A, &B), so the copy is between the pinned buffers.
    check_cuda_call(cudaMemcpy(A, B, 1 * sizeof(int), cudaMemcpyHostToHost));

    check_cuda_call(cudaFreeHost(A));
    check_cuda_call(cudaFreeHost(B));
    return 0;
}
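With a blocking call after the kernel launch (either the cudaMemcpy() shown above or the commented-out cudaDeviceSynchronize()), the app waits for around 1 second on my card while the kernel finishes. Without any blocking call after the launch, it exits immediately.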
Is there any difference between the cuda memcpy call and the ordinary C++ function memcpy?
Yes. Unlike plain memcpy(), cudaMemcpy()
with cudaMemcpyHostToHost
synchronizes with the device: it blocks until previously launched GPU work has finished, and because of that it can also return errors from previous asynchronous calls
.
I know that if I want to do a 2D memcpy from host to host, I have to use the CUDA call, since there is no such function in C++. Are there any others?
You might be able to use cudaMemcpyAsync()
with cudaMemcpyHostToHost
to do copies on the host without blocking the CPU, but I haven't tested it.