I have a sparse triangular solver that works with 4 Tesla V100 GPUs. I completed implementation and all things work well in terms of accuracy. However, I am using a CPU timer to calculate elapsed time. I know that the CPU timer is not the perfect choice for calculating elapsed time, since I can use CUDA Events.
But the thing is, I do not know how to implement CUDA Events for multi GPU. As I saw from NVIDIA tutorials, they use events for inter-GPU synchronization, i.e. waiting for other GPUs to finish dependencies. Anyway, I define events like;
cudaEvent_t start_events[num_gpus]
cudaEvent_t end_events[num_gpus]
I can also initialize these events in a loop by setting the current GPU iteratively.
And my kernel execution is like;
for(int i = 0; i < num_gpus; i++)
{
CUDA_FUNC_CALL(cudaSetDevice(i));
kernel<<<>>>()
}
for(int i = 0; i < num_devices; i++)
{
CUDA_FUNC_CALL(cudaSetDevice(i));
CUDA_FUNC_CALL(cudaDeviceSynchronize());
}
My question is, how should I use these events to record elapsed times for each GPU separately?
You need to create two events per GPU, and record the events before and after the kernel call on each GPU.
It could look something like this:
cudaEvent_t start_events[num_gpus];
cudaEvent_t end_events[num_gpus];
for(int i = 0; i < num_gpus; i++)
{
CUDA_FUNC_CALL(cudaSetDevice(i));
CUDA_FUNC_CALL(cudaEventCreate(&start_events[i]));
CUDA_FUNC_CALL(cudaEventCreate(&end_events[i]));
}
for(int i = 0; i < num_gpus; i++)
{
CUDA_FUNC_CALL(cudaSetDevice(i));
// In cudaEventRecord, ommit stream or set it to 0 to record
// in the default stream. It must be the same stream as
// where the kernel is launched.
CUDA_FUNC_CALL(cudaEventRecord(start_events[i], stream));
kernel<<<>>>()
CUDA_FUNC_CALL(cudaEventRecord(end_events[i], stream));
}
for(int i = 0; i < num_devices; i++)
{
CUDA_FUNC_CALL(cudaSetDevice(i));
CUDA_FUNC_CALL(cudaDeviceSynchronize());
}
for(int i = 0; i < num_devices; i++)
{
//the end_event must have happened to get a valid duration
//In this example, this is true because of previous device synchronization
float time_in_ms;
CUDA_FUNC_CALL(cudaEventElapsedTime(&time_in_ms, start_events[i], end_events[i]));
printf("Elapsed time on device %d: %f ms\n", i, time_in_ms)
}
for(int i = 0; i < num_gpus; i++)
{
CUDA_FUNC_CALL(cudaSetDevice(i));
CUDA_FUNC_CALL(cudaEventDestroy(start_events[i]));
CUDA_FUNC_CALL(cudaEventDestroy(end_events[i]));
}