i have some cuda code with a 2d kernel function like this:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define row 65
#define col 13824
__global__ void tt(int *pp){
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
for(unsigned ig=0;ig<33;ig++){
pp[i*col+j]+=1;
}
return;
}
int main(){
int *pp;
int *rr;
pp=(int*)malloc(sizeof(int)*col*row);
rr=(int*)malloc(sizeof(int)*col*row);
memset(pp,0,sizeof(int)*row*col);
int *pp_g;
cudaMalloc((void**)&pp_g,sizeof(int)*row*col);
cudaMemcpy(pp_g,pp,sizeof(int)*row*col,cudaMemcpyHostToDevice);
dim3 block(32,32,1);
dim3 grid(row/32+1,col/32+1,1);
tt<<<grid,block>>>(pp_g);
cudaDeviceSynchronize();
cudaMemcpy(rr,pp_g,sizeof(int)*row*col,cudaMemcpyDeviceToHost);
int ct=0;
for(unsigned i=0;i<row*col;i++){
if(rr[i]!=33){
//printf("%d\n",rr[i]);
ct++;
}
//printf("%d\n",rr[i]);
}
printf("%d\n",ct);
return 0;
}
the excepted results in rr array should all be 33, however, the actual results are all 0. But when I change block and grid to block(8,8,1)
and grid(row/8+1,col/8+1,1)
, some results in rr
array become correct while there are still 512 wrong results being 0 in rr
array.
I cannot figure out what's wrong with my code. I want to know what happend with my results.
The threads of your kernel perform illegal memory accesses when i >= row
and j >= col
.
Also, when i >= row
and j < col
, the threads perform memory writes concurrently with the "legal" threads.
Checking CUDA errors would have helped you understand the problem. See this post.