I'm writing a CUDA-accelerated cellular automata and I want it to run at a high update rate (around 100k updates per second). Is there a way to make consecutive CUDA calls faster?
I tested it with empty loops (256 threads x 3 blocks).
First, I have wrote a simple loop with cuda calls and got 40 thousand iterations per second. Then I changed my setup: I moved the loop into my kernel function and added __syncthreads()
call at the end of the loop body. That increased update rate to 50k per second. Can this be made even faster?
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#define CHUNK_SIZE 16
struct Cell {
uint8_t type;
};
struct Chunk {
uint16_t x, y;
Cell cells[CHUNK_SIZE * CHUNK_SIZE][2];
};
__global__ void update(Chunk *chunks, unsigned long long steps) {
for (unsigned long long i = 0; i < steps; i++) {
// ...
__syncthreads();
}
}
int main(void) {
thrust::host_vector<Chunk> h_vec;
h_vec.push_back(Chunk { 0, 0 });
h_vec.push_back(Chunk { 10, 0 });
h_vec.push_back(Chunk { 10, 12 });
thrust::device_vector<Chunk> d_vec = h_vec;
Chunk *chunks = thrust::raw_pointer_cast(d_vec.data());
clock_t start = clock();
unsigned long long i = 0;
while ((clock() - start) < 5000) {
update<<<d_vec.size(), dim3(CHUNK_SIZE, CHUNK_SIZE)>>>(chunks, 100000);
i += 100000;
}
cudaDeviceSynchronize();
std::cout << (i / float(clock() - start)) << " iterations per second" << std::endl;
return 0;
}
Your benchmarking is flawed. You use clock()
, which in host code refers to the consumed CPU time and does not count time that the CPU sits idle waiting for the GPU. In device code it is some counter with a specific clock rate. What you actually want to measure is the wall-clock time between start and end of the kernel. The C++ construct to do this is the std::chrono::steady_clock
whereas CUDA provides events to measure device-side time.
Here is a modified version of your code that a) properly measures elapsed time and b) uses a stream to keep GPU code asynchronous from the CPU, which improves GPU utilization since there is no round-trip delay between starting kernels. The same can be achieved by compiling with --default-stream per-thread
.
#include <iostream>
__global__ void update(volatile int *chunks, unsigned long long steps) {
const unsigned tid = blockDim.x * blockIdx.x + threadIdx.x;
for (unsigned long long i = 0; i < steps; i++) {
chunks[tid] = i; // volatile to avoid compiler optimization
__syncthreads();
}
}
int main() {
cudaStream_t stream;
if(cudaStreamCreate(&stream))
return 1;
cudaEvent_t start, stop;
if(cudaEventCreate(&start))
return 2;
if(cudaEventCreate(&stop))
return 3;
int* chunks;
unsigned blocks = 3, blocksize = 256;
unsigned long long iterations = 100000;
if(cudaMalloc(&chunks, sizeof(*chunks) * blocks * blocksize))
return 4;
update<<<blocks, blocksize, 0, stream>>>(chunks, iterations); // warmup
if(cudaEventRecord(start, stream))
return 5;
update<<<blocks, blocksize, 0, stream>>>(chunks, iterations);
if(cudaEventRecord(stop, stream))
return 6;
if(cudaEventSynchronize(stop))
return 7;
float milliseconds;
if(cudaEventElapsedTime(&milliseconds, start, stop))
return 8;
std::cout << "loop in kernel: " << (iterations / milliseconds * 1e3)
<< " iterations per second\n";
if(cudaEventRecord(start, stream))
return 9;
for(unsigned long i = 0; i < iterations; ++i)
update<<<blocks, blocksize, 0, stream>>>(chunks, 1);
if(cudaEventRecord(stop, stream))
return 10;
if(cudaEventSynchronize(stop))
return 11;
if(cudaEventElapsedTime(&milliseconds, start, stop))
return 12;
std::cout << "loop around kernel: " << (iterations / milliseconds * 1e3)
<< " iterations per second\n";
return 0;
}
On my laptop (GeForce RTX 4050 Max-Q / Mobile) the loop inside the kernel results in 36e6 iterations per second and the loop around the kernel in 550e3 iterations per second.
Note that of course those two patterns are not equivalent since __syncthreads()
synchronizes only within a single thread block while kernel launches synchronize the whole grid.