cudabenchmarkingnvidiaunified-memory

Cuda Unified memory vs cudaMalloc


I am trying to do some benchmarking to ensure using CUDA's Unified Memory(UM) approach will not hurt us wrt performance.

I am performing an FFT. One way i use UM, one way i use the cudaMalloc

I compare the results afterwards and they all match up (which is good).

however, the timing i'm getting for the UM approach is ~.5ms vs the cudaMalloc way of ~.04 (after performing the run multiple times an averaging)

I am using Event records to do the timing. I have one right before and after the cufftExecC2C call.

Furthermore, I added two more event records to measure the time before any memory transfer to the device, and after using the data once i get it back from the device.

when doing this, i see the UM approach take ~1.6ms and the cudaMalloc approach taking ~.7.

Below is a snippet of code that does the UM approach:

cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);

cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));

cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
                stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);

setupWave(dataSize, inData);

cudaEventRecord(start_before_memHtoD);
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 1);
cudaDeviceSynchronize();

cudaEventRecord(start_kernel);

cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);

cudaEventRecord(stop_kernel);

cudaEventSynchronize(stop_kernel);

float sum = 0;
for (int i = 0; i < dataSize; i++) {
        sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);

std::cout << "sum for UM is " << sum << std::endl;

float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
                stop_after_memDtoH);

resultString_um += std::to_string(dataSize) + " samples took "
                + std::to_string(umTime) + "ms,  Overall: "
                + std::to_string(overallUmTime) + "\n";

cudaFree(outData);
cudaFree(inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);

cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);

cufftDestroy(plan);

The following is for the cudaMalloc approach

cufftComplex *d_inData;
cufftComplex *d_outData;
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);

cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
                stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);

setupWave(dataSize, inData);

cudaEventRecord(start_before_memHtoD);
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
                                        cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);

cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);

cudaEventRecord(stop_kernel);

cudaEventSynchronize(stop_kernel);

cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
                cudaMemcpyDefault);
cudaEventRecord(stop_after_memDtoH);

float sum = 0;
for (int i = 0; i < dataSize; i++) {
        sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);

std::cout << "sum for UM is " << sum << std::endl;

float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
                stop_after_memDtoH);

resultString_um += std::to_string(dataSize) + " samples took "
                + std::to_string(umTime) + "ms,  Overall: "
                + std::to_string(overallUmTime) + "\n";

cudaFree(outData);
cudaFree(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);

cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);

cufftDestroy(plan);

Is there something else I could be doing when using the unified memory approach to speed it up? I expected UM to be slower, but not by this much.

We are using the P100 on redhat 7.3 with Cuda 9


Solution

  • One problem with your posted code is that you are not doing a cudaMemPrefetchAsync on the output data from the FFT. According to my testing, this makes a significant difference. There were a few other problems with your code, for example we do not call cudaFree on a pointer allocated with malloc.

    Here's a complete code built around what you have shown. When I run this on CentOS7.4, CUDA 9.1, Tesla P100, I get comparable times for the FFT performed in the managed memory case (3.52ms) vs. the FFT performed in the non-managed memory case (3.45ms):

    $ cat t43.cu
    #include <cufft.h>
    #include <iostream>
    #include <string>
    
    //using namespace std;
    const int dataSize  = 1048576*32;
    void setupWave(const int ds, cufftComplex *d){
      for (int i = 0; i < ds; i++){
        d[i].x = 1.0f;
        d[i].y = 0.0f;}
    }
    int main(){
    
    cufftComplex *inData, *outData;
    
    cufftHandle plan;
    cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
    
    cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
    cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));
    
    cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
                    stop_after_memDtoH;
    cudaEventCreate(&start_kernel);
    cudaEventCreate(&start_before_memHtoD);
    cudaEventCreate(&stop_kernel);
    cudaEventCreate(&stop_after_memDtoH);
    
    setupWave(dataSize, inData);
    
    cudaEventRecord(start_before_memHtoD);
    cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 0);
    cudaMemPrefetchAsync(outData, dataSize * sizeof(cufftComplex), 0);
    cudaDeviceSynchronize();
    
    cudaEventRecord(start_kernel);
    
    cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);
    
    cudaEventRecord(stop_kernel);
    
    cudaEventSynchronize(stop_kernel);
    
    float sum = 0;
    for (int i = 0; i < dataSize; i++) {
            sum += outData[i].x + outData[i].y;
    }
    cudaEventRecord(stop_after_memDtoH);
    cudaEventSynchronize(stop_after_memDtoH);
    
    std::cout << "sum for UM is " << sum << std::endl;
    
    float umTime = 0;
    float overallUmTime = 0;
    cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
    cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
                    stop_after_memDtoH);
    
    std::string resultString_um = std::to_string(dataSize) + " samples took " + std::to_string(umTime) + "ms,  Overall: " + std::to_string(overallUmTime) + "\n";
    
    std::cout << resultString_um;
    cudaEventDestroy(start_kernel);
    cudaEventDestroy(stop_kernel);
    cudaFree(inData);
    cudaFree(outData);
    cudaEventDestroy(start_before_memHtoD);
    cudaEventDestroy(stop_after_memDtoH);
    
    cufftDestroy(plan);
    
    
    
    cufftComplex *d_inData;
    cufftComplex *d_outData;
    inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
    outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
    cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
    cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
    //cufftHandle plan;
    cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
    
    //cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
    //                stop_after_memDtoH;
    cudaEventCreate(&start_kernel);
    cudaEventCreate(&start_before_memHtoD);
    cudaEventCreate(&stop_kernel);
    cudaEventCreate(&stop_after_memDtoH);
    
    setupWave(dataSize, inData);
    
    cudaEventRecord(start_before_memHtoD);
    cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
                                            cudaMemcpyHostToDevice);
    cudaEventRecord(start_kernel);
    
    cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);
    
    cudaEventRecord(stop_kernel);
    
    cudaEventSynchronize(stop_kernel);
    
    cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
                    cudaMemcpyDefault);
    
     sum = 0;
    for (int i = 0; i < dataSize; i++) {
            sum += outData[i].x + outData[i].y;
    }
    cudaEventRecord(stop_after_memDtoH);
    cudaEventSynchronize(stop_after_memDtoH);
    
    std::cout << "sum for non-UM is " << sum << std::endl;
    
    //float umTime = 0;
    //float overallUmTime = 0;
    cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
    cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
                    stop_after_memDtoH);
    
    resultString_um = std::to_string(dataSize) + " samples took "
                    + std::to_string(umTime) + "ms,  Overall: "
                    + std::to_string(overallUmTime) + "\n";
    std::cout << resultString_um;
    free(outData);
    free(inData);
    cudaFree(d_outData);
    cudaFree(d_inData);
    cudaEventDestroy(start_kernel);
    cudaEventDestroy(stop_kernel);
    
    cudaEventDestroy(start_before_memHtoD);
    cudaEventDestroy(stop_after_memDtoH);
    
    cufftDestroy(plan);
    
    }
    $ nvcc -std=c++11 -arch=sm_60 -o t43 t43.cu -lcufft
    $ ./t43
    sum for UM is 3.35544e+07
    33554432 samples took 3.520640ms,  Overall: 221.909988
    sum for non-UM is 3.35544e+07
    33554432 samples took 3.456160ms,  Overall: 278.099426
    $