Tags: c++, cuda, gpu-shared-memory, prefix-sum

CUDA parallel scan algorithm shared memory race condition


I'm reading the book "Programming Massively Parallel Processors" (3rd edition), which presents an implementation of the Kogge-Stone parallel scan algorithm. The kernel is meant to be run by a single block (this is just a preliminary simplification) and is reproduced below.

// X is the input array, Y is the output array, InputSize is the size of the input array
__global__ void Kogge_Stone_scan_kernel(float* X, float* Y, int InputSize) {
    __shared__ float XY[SECTION_SIZE]; // SECTION_SIZE is basically blockDim.x
    
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < InputSize)
        XY[threadIdx.x] = X[i];

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        __syncthreads();
        if (threadIdx.x >= stride)
            XY[threadIdx.x] += XY[threadIdx.x - stride]; // Race condition here?
    }

    Y[i] = XY[threadIdx.x];
}

Regardless of the way the algorithm works, I'm a bit puzzled by the line XY[threadIdx.x] += XY[threadIdx.x - stride]. Say stride = 1; then the thread with threadIdx.x = 6 will perform the operation XY[6] += XY[5]. However, at the same time the thread with threadIdx.x = 5 will be performing XY[5] += XY[4]. The question is: is there any guarantee that thread 6 will read the original value of XY[5] instead of XY[5] + XY[4]? Note that the problem is not confined to a single warp, where lockstep execution might prevent the race condition.

Thanks


Solution

  • is there any guarantee that thread 6 will read the original value of XY[5] instead of XY[5] + XY[4]

    No, CUDA provides no guarantee of thread execution order (lockstep or otherwise) and there is nothing in the code to sort that out either.
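
    To make the hazard concrete, suppose XY is initially all 1s and stride = 1. Depending on how the hardware happens to schedule threads 5 and 6, thread 6 can observe either value of XY[5]:

    // Interleaving A (intended): thread 6 reads before thread 5 writes
    //   thread 6: reads  XY[5]            (gets 1)
    //   thread 5: writes XY[5] = 1 + 1 = 2
    //   thread 6: writes XY[6] = 1 + 1 = 2   <- correct
    //
    // Interleaving B (hazard): thread 5 writes before thread 6 reads
    //   thread 5: writes XY[5] = 1 + 1 = 2
    //   thread 6: reads  XY[5]            (gets 2)
    //   thread 6: writes XY[6] = 1 + 2 = 3   <- wrong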

    By the way, cuda-memcheck and compute-sanitizer are pretty good at identifying shared memory race conditions:

    $ cat t2.cu
    const int SECTION_SIZE = 256;
    __global__ void Kogge_Stone_scan_kernel(float* X, float* Y, int InputSize) {
        __shared__ float XY[SECTION_SIZE]; // SECTION_SIZE is basically blockDim.x
    
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < InputSize)
            XY[threadIdx.x] = X[i];
    
        for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
            __syncthreads();
            if (threadIdx.x >= stride)
                XY[threadIdx.x] += XY[threadIdx.x - stride]; // Race condition here?
        }
    
        Y[i] = XY[threadIdx.x];
    }
    
    int main(){
      const int nblk = 1;
      const int sz = nblk*SECTION_SIZE;
      const int bsz = sz*sizeof(float);
      float *X, *Y;
      cudaMallocManaged(&X, bsz);
      cudaMallocManaged(&Y, bsz);
      Kogge_Stone_scan_kernel<<<nblk, SECTION_SIZE>>>(X, Y, sz);
      cudaDeviceSynchronize();
    }
    $ nvcc -o t2 t2.cu -lineinfo
    $ cuda-memcheck ./t2
    ========= CUDA-MEMCHECK
    ========= ERROR SUMMARY: 0 errors
    $ cuda-memcheck --tool racecheck ./t2
    ========= CUDA-MEMCHECK
    ========= ERROR: Race reported between Read access at 0x000001a0 in /home/user2/misc/junk/t2.cu:12:Kogge_Stone_scan_kernel(float*, float*, int)
    =========     and Write access at 0x000001c0 in /home/user2/misc/junk/t2.cu:12:Kogge_Stone_scan_kernel(float*, float*, int) [6152 hazards]
    =========
    ========= RACECHECK SUMMARY: 1 hazard displayed (1 error, 0 warnings)
    $
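
    Note that cuda-memcheck is deprecated in newer CUDA toolkits in favor of compute-sanitizer, which provides the same racecheck tool. Assuming a toolkit recent enough to ship it, the equivalent invocation would be:

    $ compute-sanitizer --tool racecheck ./t2

    The exact report format differs a bit, but it should flag the same read-after-write hazard.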
    

    As you have probably already surmised, you can sort this out by breaking up the read and write operations in the offending line, with a barrier in-between:

    $ cat t2.cu
    const int SECTION_SIZE = 256;
    __global__ void Kogge_Stone_scan_kernel(float* X, float* Y, int InputSize) {
        __shared__ float XY[SECTION_SIZE]; // SECTION_SIZE is basically blockDim.x
    
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < InputSize)
            XY[threadIdx.x] = X[i];
    
        for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
            __syncthreads();
            float val;
            if (threadIdx.x >= stride)
                val = XY[threadIdx.x - stride];
            __syncthreads();
            if (threadIdx.x >= stride)
                XY[threadIdx.x] += val;
        }
    
        Y[i] = XY[threadIdx.x];
    }
    
    int main(){
      const int nblk = 1;
      const int sz = nblk*SECTION_SIZE;
      const int bsz = sz*sizeof(float);
      float *X, *Y;
      cudaMallocManaged(&X, bsz);
      cudaMallocManaged(&Y, bsz);
      Kogge_Stone_scan_kernel<<<nblk, SECTION_SIZE>>>(X, Y, sz);
      cudaDeviceSynchronize();
    }
    $ nvcc -o t2 t2.cu -lineinfo
    $ cuda-memcheck --tool racecheck ./t2
    ========= CUDA-MEMCHECK
    ========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
    $
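
    A common variation that avoids the second barrier inside the loop is double-buffering: each iteration reads from one shared array and writes to the other, then the two swap roles, so no element is ever read and written in the same step. What follows is a minimal sketch of that approach (my own illustration, assuming the same single-block launch as above; the names bufA/bufB are arbitrary):

    const int SECTION_SIZE = 256;
    __global__ void Kogge_Stone_scan_double_buffer(float* X, float* Y, int InputSize) {
        __shared__ float bufA[SECTION_SIZE];
        __shared__ float bufB[SECTION_SIZE];
        float* in  = bufA;  // buffer read during the current iteration
        float* out = bufB;  // buffer written during the current iteration

        int i = blockIdx.x * blockDim.x + threadIdx.x;
        in[threadIdx.x] = (i < InputSize) ? X[i] : 0.0f;

        for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
            __syncthreads(); // make the previous iteration's writes visible
            if (threadIdx.x >= stride)
                out[threadIdx.x] = in[threadIdx.x] + in[threadIdx.x - stride];
            else
                out[threadIdx.x] = in[threadIdx.x]; // pass unchanged elements through
            float* tmp = in; in = out; out = tmp;   // swap roles for the next iteration
        }

        // No barrier needed here: each thread reads the element it wrote itself.
        if (i < InputSize)
            Y[i] = in[threadIdx.x];
    }

    This trades twice the shared memory for one __syncthreads() per iteration instead of two, and it removes the hazard entirely because reads and writes never target the same array within an iteration.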