I'm using CUDA dynamic parallelism to launch kernels from within a kernel function.
According to the CUDA documentation, device-side kernel launches are limited to a fixed recursion depth because of resource constraints.
But in my project, the parent kernel does not need to wait for the child kernel to exit; in other words, they are completely independent.
So is there a way to launch kernels from within kernel functions without being limited by the recursion depth?
I used cudaDeviceSetLimit() to set cudaLimitDevRuntimeSyncDepth, but the restriction is still there.
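On the host, before the first kernel launch, I do roughly this (a minimal sketch; the depth value 16 is just an example):

#include <cstdio>

int main(){
  // raise the device-runtime synchronization depth before any kernel launch;
  // the hardware still caps the nesting depth, which is the problem
  cudaError_t err = cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 16);
  if (err != cudaSuccess) printf("set limit: %s\n", cudaGetErrorString(err));
  // ... launch the first do_something kernel here ...
  return 0;
}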
Example:

__global__ void do_something(MyQueue* queue, Task* task) {
    // do something ...
    task->execute();
    // If the queue is not empty, pop a task from it and launch a child kernel to execute it
    Task* t = queue->pop();
    if (t) {
        // device-side streams must be created with the non-blocking flag
        cudaStream_t stream;
        cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
        do_something<<<t->gridSize, t->blockSize, t->mem, stream>>>(queue, t);
    }
}
I compile with:

-rdc=true -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_61,code=compute_61
According to my testing, making use of CDP2, it certainly seems possible to launch a kernel that will call sub-kernels for a "long" sequence:
# cat t225.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>
using mt = int;
const mt my_stopc = 32768;
__global__ void k(mt *d, const mt my_stop){
  mt c = *d;
  if (c < my_stop){
    c++;
    *d = c;
    // tail launch: the child grid begins only after this (parent) grid exits,
    // so the chain runs sequentially instead of nesting
    k<<<1,1,0, cudaStreamTailLaunch>>>(d, my_stop);
  }
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) printf("device: %llu, %s\n", (unsigned long long)c, cudaGetErrorString(err));
}
int main(int argc, char *argv[]){
  mt my_stop = my_stopc;
  if (argc > 1) my_stop = atol(argv[1]);
  mt *d;
  cudaMallocManaged(&d, sizeof(d[0]));
  *d = 0;
  k<<<1,1>>>(d, my_stop);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) std::cout << "host 1: " << cudaGetErrorString(err) << std::endl;
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) std::cout << "host 2: " << cudaGetErrorString(err) << std::endl;
  std::cout << "iter: " << *d << std::endl;
}
# nvcc -o t225 t225.cu -arch=sm_89 -rdc=true -lcudadevrt -lineinfo
# ./t225
iter: 32768
# ./t225 70000
iter: 70000
#
CUDA 12.2, L4 GPU. This works, presumably, because a tail-launched child grid does not begin until its parent grid has exited: the chain executes sequentially rather than as nested recursion, so the launch depth limit never comes into play.
If I increase the count to 1000000, then the (parent) kernel completes in about 12 seconds:
# time ./t225 1000000
iter: 1000000
real 0m12.306s
user 0m10.155s
sys 0m2.080s
#
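Since you indicated the parent kernel doesn't need to wait for the child at all, note that CDP2 also provides a fire-and-forget named stream. Here is a minimal sketch (the do_work/child_work kernels are just invented placeholders): a child grid launched this way has no ordering against the rest of the parent grid, and the parent never waits for it:

__global__ void child_work(int *data){
  // ... independent work, e.g. process data ...
}

__global__ void do_work(int *data){
  // fire-and-forget launch: the child grid may begin executing immediately,
  // with no ordering against the remainder of the parent grid,
  // and the parent does not wait for it to complete
  child_work<<<1, 1, 0, cudaStreamFireAndForget>>>(data);
}

Note also that under CDP2 a parent kernel cannot synchronize on its children at all (device-side cudaDeviceSynchronize() is no longer supported), which lines up with your requirement that parent and child be completely independent.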