cudagpucuda-events

Recording elapsed time of CUDA kernels with cudaEventRecord() for multi-GPU program


I have a sparse triangular solver that works with 4 Tesla V100 GPUs. I completed implementation and all things work well in terms of accuracy. However, I am using a CPU timer to calculate elapsed time. I know that the CPU timer is not the perfect choice for calculating elapsed time, since I can use CUDA Events.

But the thing is, I do not know how to implement CUDA Events for multi GPU. As I saw from NVIDIA tutorials, they use events for inter-GPU synchronization, i.e. waiting for other GPUs to finish dependencies. Anyway, I define events like;

cudaEvent_t start_events[num_gpus]
cudaEvent_t end_events[num_gpus]

I can also initialize these events in a loop by setting the current GPU iteratively.

And my kernel execution is like;

 for(int i = 0; i < num_gpus; i++)
 {
     CUDA_FUNC_CALL(cudaSetDevice(i));
     kernel<<<>>>()
 }

 for(int i = 0; i < num_devices; i++)
 {
     CUDA_FUNC_CALL(cudaSetDevice(i));
     CUDA_FUNC_CALL(cudaDeviceSynchronize());
 }

My question is, how should I use these events to record elapsed times for each GPU separately?


Solution

  • You need to create two events per GPU, and record the events before and after the kernel call on each GPU.

    It could look something like this:

    cudaEvent_t start_events[num_gpus];
    cudaEvent_t end_events[num_gpus];
    
    for(int i = 0; i < num_gpus; i++)
     {
         CUDA_FUNC_CALL(cudaSetDevice(i));
         CUDA_FUNC_CALL(cudaEventCreate(&start_events[i]));
         CUDA_FUNC_CALL(cudaEventCreate(&end_events[i]));
     }
    
     for(int i = 0; i < num_gpus; i++)
     {
         CUDA_FUNC_CALL(cudaSetDevice(i));
         // In cudaEventRecord, ommit stream or set it to 0 to record 
         // in the default stream. It must be the same stream as 
         // where the kernel is launched.
         CUDA_FUNC_CALL(cudaEventRecord(start_events[i], stream)); 
         kernel<<<>>>()
         CUDA_FUNC_CALL(cudaEventRecord(end_events[i], stream));
     }
    
     for(int i = 0; i < num_devices; i++)
     {
         CUDA_FUNC_CALL(cudaSetDevice(i));
         CUDA_FUNC_CALL(cudaDeviceSynchronize());
     }
    
     for(int i = 0; i < num_devices; i++)
     {
         //the end_event must have happened to get a valid duration
         //In this example, this is true because of previous device synchronization
         float time_in_ms;
         CUDA_FUNC_CALL(cudaEventElapsedTime(&time_in_ms, start_events[i], end_events[i]));
         printf("Elapsed time on device %d: %f ms\n", i, time_in_ms)
     }
    
    for(int i = 0; i < num_gpus; i++)
     {
         CUDA_FUNC_CALL(cudaSetDevice(i));
         CUDA_FUNC_CALL(cudaEventDestroy(start_events[i]));
         CUDA_FUNC_CALL(cudaEventDestroy(end_events[i]));
     }