While testing whether shared memory can be accessed by multiple kernels, I have found that the data in shared memory are sometimes still there when accessed by a subsequent kernel, but sometimes not. What's more, when debugging the program with cuda-gdb, the data written to shared memory by the previous kernel can ALWAYS be read by the next kernels.
The following is a piece of test code, run on 2 GPUs.
#include <cstdio>
#include <cuda_runtime.h>

extern __shared__ double f_ds[];

__global__ void kernel_writeToSharedMem(double* f_dev, int spd_x)
{
    int tid_dev_x = blockDim.x * blockIdx.x + threadIdx.x;
    int tid_dev_y = blockDim.y * blockIdx.y + threadIdx.y;
    int tid_dev = tid_dev_y * spd_x + tid_dev_x;
    if (tid_dev < blockDim.x * blockDim.y * gridDim.x * gridDim.y)
        f_ds[threadIdx.y * blockDim.x + threadIdx.x] = 0.12345; // write a marker value into shared memory
    __syncthreads();
}

__global__ void kernel_readFromSharedMem(double* f_dev, int dev_no, int spd_x)
{
    int tid_dev_x = blockDim.x * blockIdx.x + threadIdx.x;
    int tid_dev_y = blockDim.y * blockIdx.y + threadIdx.y;
    int tid_dev = tid_dev_y * spd_x + tid_dev_x;
    if (tid_dev < blockDim.x * blockDim.y * gridDim.x * gridDim.y)
    {
        // read shared memory without ever having written it in this kernel
        f_dev[tid_dev] = f_ds[threadIdx.y * blockDim.x + threadIdx.x];
        printf("threadID %d in dev [%d] is having number %f\n",
               tid_dev, dev_no, f_ds[threadIdx.y * blockDim.x + threadIdx.x]);
    }
    __syncthreads();
}
int main()
{
    ...
    dim3 block_size(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid_size(spd_x / BLOCK_SIZE, spd_y / BLOCK_SIZE);
    for (int i = 0; i < ngpus; i++)
    {
        cudaSetDevice(i);
        kernel_writeToSharedMem<<<grid_size, block_size, sizeof(double) * BLOCK_SIZE * BLOCK_SIZE, stream[i]>>>(f_dev[i], spd_x);
        cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated and redundant after this
    }
    for (int i = 0; i < ngpus; i++)
    {
        cudaSetDevice(i);
        kernel_readFromSharedMem<<<grid_size, block_size, sizeof(double) * BLOCK_SIZE * BLOCK_SIZE, stream[i]>>>(f_dev[i], i, spd_x);
        cudaDeviceSynchronize();
    }
    ...
}
Four situations occurred after running the program:
1) device 0 reads 0.12345 but device 1 reads 0;
2) device 0 reads 0 but device 1 reads 0.12345;
3) both device 0 and device 1 read 0;
4) both device 0 and device 1 read 0.12345.
When running under cuda-gdb, 4) is always the case.
Does this indicate that shared memory persists only for the lifetime of a single kernel? Is shared memory only occasionally cleared or freed after a kernel finishes?
Shared memory is guaranteed to have scope only for the life of the block to which it is assigned. Any attempt to reuse shared memory from block to block, or from kernel launch to kernel launch, is completely undefined behaviour and should never be relied upon in a sane code design. The values you sometimes observe are simply stale data left behind by whichever block last ran on that multiprocessor; nothing guarantees either their presence or their absence.
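If data must survive from one kernel launch to the next, the supported pattern is to stage it through global memory, which does persist across launches. Below is a minimal, self-contained sketch of that pattern; the kernel names stageKernel and consumeKernel, the buffers d_buf and d_out, and the sizes N and BLOCK are hypothetical names for illustration only, not part of the code above:

#include <cstdio>
#include <cuda_runtime.h>

#define N 256
#define BLOCK 64

// Each block does its scratch work in shared memory, then persists the
// result to global memory, which outlives the block and the kernel.
__global__ void stageKernel(double* d_buf)
{
    __shared__ double s[BLOCK];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    s[threadIdx.x] = 0.12345;      // per-block scratch value (hypothetical)
    __syncthreads();
    d_buf[gid] = s[threadIdx.x];   // persist through global memory
}

// The next kernel reloads the data from global memory into its own,
// freshly populated shared memory before using it.
__global__ void consumeKernel(const double* d_buf, double* d_out)
{
    __shared__ double s[BLOCK];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    s[threadIdx.x] = d_buf[gid];   // reload: never assume stale contents
    __syncthreads();
    d_out[gid] = s[threadIdx.x];
}

int main()
{
    double *d_buf, *d_out;
    cudaMalloc(&d_buf, N * sizeof(double));
    cudaMalloc(&d_out, N * sizeof(double));

    stageKernel<<<N / BLOCK, BLOCK>>>(d_buf);
    consumeKernel<<<N / BLOCK, BLOCK>>>(d_buf, d_out);
    cudaDeviceSynchronize();

    double h = 0.0;
    cudaMemcpy(&h, d_out, sizeof(double), cudaMemcpyDeviceToHost);
    printf("first element: %f\n", h); // deterministically 0.12345

    cudaFree(d_buf);
    cudaFree(d_out);
    return 0;
}

The key point is that each kernel fills its own shared memory from global memory on entry; nothing is assumed to survive from the previous launch, so the result no longer depends on which block happened to run on which multiprocessor.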