I have to access elements of a very large array in CUDA kernels. The size of arrays could be above INT_MAX in some applications.
Essentially those are taking the form of the following.
__global__ function(double *dArr) {
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
dArr[index * WIDTH] = ...; // WIDTH is 256 or 512.
}
To my understanding, CUDA variables such as threadIdx.x
are unsigned int
with smaller limits than the usual uint
.
I am trying to cast these CUDA variables into higher type so that they can be used as an index of the large array.
I have tried some, but still my kernels are not working with large arrays. And I cannot even understand the results from the following simple lines of code (it is not even taking large numbers).
#include <cstdio>
__global__ void printIndex() {
printf("blockIdx.x %lu (%d), blockDim.x %lu (%d), threadIdx.x %lu (%d)\n",
blockIdx.x, blockIdx.x, blockDim.x, blockDim.x, threadIdx.x, threadIdx.x);
//printf("blockIdx.x %d, blockDim.x %d, threadIdx.x %d \n", blockIdx.x, blockDim.x, threadIdx.x); // this works fine.
}
int main() {
printIndex<<<2,64>>>();
cudaDeviceSynchronize();
unsigned int ui = 1000;
printf("ui %lu (%d) \n", ui, ui); // this is just for the comparison.
return 0;
}
The reason I chose %lu
is to mimic a certain type of casting to higher type and the result is quite strange. (When I use %u
, it works fine)
blockIdx.x 4294967297 (64), blockDim.x 0 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 4294967297 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 8589934594 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 12884901891 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 17179869188 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 21474836485 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 25769803782 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 30064771079 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 34359738376 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 38654705673 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 42949672970 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 47244640267 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 51539607564 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 55834574861 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 60129542158 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 64424509455 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 68719476752 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 73014444049 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 77309411346 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 81604378643 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 85899345940 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 90194313237 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 94489280534 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 98784247831 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 103079215128 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 107374182425 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 111669149722 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 115964117019 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 120259084316 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 124554051613 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 128849018910 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 133143986207 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 137438953504 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 141733920801 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 146028888098 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 150323855395 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 154618822692 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 158913789989 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 163208757286 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 167503724583 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 171798691880 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 176093659177 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 180388626474 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 184683593771 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 188978561068 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 193273528365 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 197568495662 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 201863462959 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 206158430256 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 210453397553 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 214748364850 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 219043332147 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 223338299444 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 227633266741 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 231928234038 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 236223201335 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 240518168632 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 244813135929 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 249108103226 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 253403070523 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 257698037820 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 261993005117 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 266287972414 (0), threadIdx.x 0 (6)
blockIdx.x 4294967297 (64), blockDim.x 270582939711 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 0 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 4294967297 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 8589934594 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 12884901891 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 17179869188 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 21474836485 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 25769803782 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 30064771079 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 34359738376 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 38654705673 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 42949672970 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 47244640267 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 51539607564 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 55834574861 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 60129542158 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 64424509455 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 68719476752 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 73014444049 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 77309411346 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 81604378643 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 85899345940 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 90194313237 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 94489280534 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 98784247831 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 103079215128 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 107374182425 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 111669149722 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 115964117019 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 120259084316 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 124554051613 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 128849018910 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 133143986207 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 137438953504 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 141733920801 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 146028888098 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 150323855395 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 154618822692 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 158913789989 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 163208757286 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 167503724583 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 171798691880 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 176093659177 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 180388626474 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 184683593771 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 188978561068 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 193273528365 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 197568495662 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 201863462959 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 206158430256 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 210453397553 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 214748364850 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 219043332147 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 223338299444 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 227633266741 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 231928234038 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 236223201335 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 240518168632 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 244813135929 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 249108103226 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 253403070523 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 257698037820 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 261993005117 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 266287972414 (0), threadIdx.x 0 (6)
blockIdx.x 0 (64), blockDim.x 270582939711 (0), threadIdx.x 0 (6)
ui 1000 (1000)
At the last line I see that ui
is printed fine with %lu
, while the prints in the CUDA kernel is very strange. First of all, %lu
and %d
casts are not the same when I am dealing with numbers at most 64. Secondly, even %d
representations are not correct. blockIdx.x
's should be 0 or 1.
Where I got lost? To check the threadIdx.x
stuff with printf
, what would be the proper way to do? And if I were to cast those into higher type which can potentially get above INT_MAX (or UINT_MAX), what would be the proper way of cast?
I added c
tag since this is about printf
in <cstdio>
.
You need to use the correct format string for each variable. It is invalid to use the format string for a 64 bit value and pass a 32 bit value. (who knows what kind of out-of-bounds access is performed this way)
The compiler should warn about this.
argument is incompatible with corresponding format string conversion (expected type "unsigned long" but argument has type "unsigned int")
If you want to print using %lu
, cast the argument to size_t, i.e. (size_t)threadIdx.x.
To compute the global thread id as 64 bit value, one could use
size_t id = size_t(threadIdx.x) + size_t(blockIdx.x) * size_t(blockDim.x);