#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <string>
#include <chrono>
#include <random>
using namespace std;
class MyTimer {
    std::chrono::time_point<std::chrono::system_clock> start;

public:
    void startCounter() {
        start = std::chrono::system_clock::now();
    }

    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
    }

    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
    }

    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
               / 1000000.0;
    }
};
int N = 10000000;

void GenData(int N, float* a)
{
    for (int i = 0; i < N; i++) a[i] = float(rand() % 1000000) / (rand() % 100 + 1);
}

__global__
void HelloWorld()
{
    printf("Hello world\n");
}

constexpr int npoints = 6;
const string costnames[] = {"allocate", "H2D", "sort", "D2H", "hostsum", "free"};
double cost[3][npoints];
volatile double dummy = 0;
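
// Test1: explicit device memory (cudaMalloc) with cudaMemcpy transfers in both directions.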
void Test1()
{
    MyTimer timer;

    timer.startCounter();
    float *h_a = new float[N];
    float *d_a;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaDeviceSynchronize();
    cost[0][0] += timer.getCounterMsPrecise();

    GenData(N, h_a);
    dummy = h_a[rand() % N];

    timer.startCounter();
    cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    cost[0][1] += timer.getCounterMsPrecise();

    timer.startCounter();
    thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(d_a);
    thrust::sort(dev_ptr, dev_ptr + N);
    cudaDeviceSynchronize();
    cost[0][2] += timer.getCounterMsPrecise();

    timer.startCounter();
    cudaMemcpy(h_a, d_a, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    dummy = h_a[rand() % N];
    cost[0][3] += timer.getCounterMsPrecise();

    timer.startCounter();
    float sum = 0;
    for (int i = 0; i < N; i++) sum += h_a[i];
    dummy = sum;
    cost[0][4] += timer.getCounterMsPrecise();

    timer.startCounter();
    delete[] h_a;
    cudaFree(d_a);
    cudaDeviceSynchronize();
    cost[0][5] += timer.getCounterMsPrecise();

    for (int i = 0; i < npoints; i++) dummy += cost[0][i];
}
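
// Test2: unified memory (cudaMallocManaged) with cudaMemPrefetchAsync calls around the sort.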
void Test2()
{
    MyTimer timer;

    timer.startCounter();
    float *a;
    cudaMallocManaged(&a, N * sizeof(float));
    cost[1][0] += timer.getCounterMsPrecise();

    GenData(N, a);
    dummy = a[rand() % N];

    timer.startCounter();
    cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
    cudaDeviceSynchronize();
    cost[1][1] += timer.getCounterMsPrecise();

    timer.startCounter();
    thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
    thrust::sort(dev_ptr, dev_ptr + N);
    cudaDeviceSynchronize();
    cost[1][2] += timer.getCounterMsPrecise();

    timer.startCounter();
    cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
    cudaDeviceSynchronize();
    dummy = a[rand() % N];
    cost[1][3] += timer.getCounterMsPrecise();

    timer.startCounter();
    float sum = 0;
    for (int i = 0; i < N; i++) sum += a[i];
    dummy = sum;
    cost[1][4] += timer.getCounterMsPrecise();

    timer.startCounter();
    cudaFree(a);
    cudaDeviceSynchronize();
    cost[1][5] += timer.getCounterMsPrecise();

    for (int i = 0; i < npoints; i++) dummy += cost[1][i];
}
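
// Test3: unified memory (cudaMallocManaged) with no prefetching; pages migrate on demand.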
void Test3()
{
    MyTimer timer;

    timer.startCounter();
    float *a;
    cudaMallocManaged(&a, N * sizeof(float));
    cost[2][0] += timer.getCounterMsPrecise();

    GenData(N, a);
    dummy = a[rand() % N];

    timer.startCounter();
    // cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
    // cudaDeviceSynchronize();
    cost[2][1] += timer.getCounterMsPrecise();

    timer.startCounter();
    thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
    thrust::sort(dev_ptr, dev_ptr + N);
    cudaDeviceSynchronize();
    cost[2][2] += timer.getCounterMsPrecise();

    timer.startCounter();
    // cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
    // cudaDeviceSynchronize();
    dummy = a[rand() % N];
    cost[2][3] += timer.getCounterMsPrecise();

    timer.startCounter();
    float sum = 0;
    for (int i = 0; i < N; i++) sum += a[i];
    dummy = sum;
    cost[2][4] += timer.getCounterMsPrecise();

    timer.startCounter();
    cudaFree(a);
    cudaDeviceSynchronize();
    cost[2][5] += timer.getCounterMsPrecise();

    for (int i = 0; i < npoints; i++) dummy += cost[2][i];
}
int main()
{
    srand(time(NULL));
    HelloWorld<<<1,1>>>();

    // warmup
    Test1();
    Test2();
    for (int i = 0; i < 3; i++)
        for (int j = 0; j < npoints; j++) cost[i][j] = 0;

    int ntest = 10;
    for (int t = 1; t <= ntest; t++) {
        Test1();
        Test2();
        Test3();
    }

    for (int i = 0; i < npoints; i++) {
        cout << "cost " << costnames[i] << " = " << (cost[0][i] / ntest) << " , " << (cost[1][i] / ntest) << " , " << (cost[2][i] / ntest) << "\n";
    }

    return 0;
}
// 2080ti
// Hello world
// cost allocate = 0.245438 , 0.0470603 , 0.029834
// cost H2D = 6.25315 , 6.36215 , 3.71e-05
// cost sort = 2.61625 , 2.6077 , 14.5418
// cost D2H = 8.74573 , 0.0520719 , 0.0759482
// cost hostsum = 6.98815 , 17.9619 , 18.3188
// cost free = 2.82205 , 3.8711 , 4.12887
I'm trying to compare the performance of cudaMalloc vs cudaMallocManaged. The use case is a matrix library where the use of the GPU is hidden from the user (i.e. they can use it like a normal library, but some operations automatically run on the GPU).
If an algorithm only uses the GPU, we can cudaMemPrefetchAsync the memory to the GPU beforehand. You can see that cost[0][2] == cost[1][2], while cost[0][3] (the explicit device-to-host copy) is much slower than cost[1][3]. However, prefetching doesn't work in the opposite direction, so cost[1][4] > cost[0][3] + cost[0][4], roughly 10-15% slower.
So, is there any way to prefetch unified memory from device to host?
Your usage of cudaMemPrefetchAsync to prefetch data to the host is incorrect. As per the API documentation:
Passing in cudaCpuDeviceId for dstDevice will prefetch the data to host memory
Running your code as is, I observe the following output on my machine.
Hello world
cost allocate = 0.190719 , 0.0421818 , 0.0278854
cost H2D = 3.29175 , 5.30171 , 4.3e-05
cost sort = 0.619405 , 0.59198 , 11.6026
cost D2H = 3.42561 , 0.730888 , 0.729142
cost hostsum = 7.34508 , 12.7422 , 12.9242
cost free = 2.20156 , 5.1042 , 5.99327
When I use cudaCpuDeviceId to prefetch the sorted data to the host, the hostsum time decreases.
Hello world
cost allocate = 0.192218 , 0.0414427 , 0.0268805
cost H2D = 3.21791 , 5.31319 , 5e-05
cost sort = 0.617812 , 0.594804 , 12.6862
cost D2H = 3.3481 , 2.9555 , 0.730368
cost hostsum = 7.23154 , 7.20661 , 12.737
cost free = 2.101 , 5.22388 , 5.8554
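For reference, here is a minimal sketch of the device-to-host prefetch in Test2 that produced the second set of numbers (it assumes the managed pointer a and size N from the code above, and uses the default stream):

timer.startCounter();
// Prefetch the sorted data back to host memory before touching it on the CPU.
cudaMemPrefetchAsync(a, N * sizeof(float), cudaCpuDeviceId, 0);
cudaDeviceSynchronize();
dummy = a[rand() % N];
cost[1][3] += timer.getCounterMsPrecise();

cudaCpuDeviceId is the special device id the runtime uses to mean host memory, so the same call handles prefetching in either direction.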