Wrong results using CUDA streams and memCpyAsync, become correct adding cudaDeviceSynchronize

I'm working on a CUDA matrix multiplication, but I did some modifications to observe how they affect performances.

I want to observe the behavior and performances of a matrix multiplication kernel, making some changes. I'm measuring the changes in GPU events time, I'm testing it in two speicific different conditions:

I have an amount of matrices (say matN) for A, B and C, then I transfer (H2D) one matrix for A, one for B and multply them, to transfer back (D2H) one C;
I have matN for A, B and C, but I transfer >1(say chunk) matrices for A and for B, I compute exactly chunk multiplications, and transfer back chunk result matrices C.

In the first case (chunk = 1) all works as expected, but in the second case (chunk > 1) I get some of Cs are correct, while others are not.

But if I put a cudaDeviceSynchronize() after the cudaMemcpyAsync, I get correct results.

Here's the code doing what I've just described:


/**** main.cpp ****/

    int chunk = matN/iters;    
    #ifdef LOWPAR
        GRIDx= 1;
        GRIDy= 1;
        label="LOW";
    #else
       int sizeX = M;
       int sizeY = N;
       GRIDx = ceil((sizeX)/BLOCK);
       GRIDy = ceil((sizeY)/BLOCK);
       label="";
    #endif

    const int bytesA = M*K*sizeof(float);
    const int bytesB = K*N*sizeof(float);
    const int bytesC = M*N*sizeof(float);

    //device mem allocation
    float *Ad, *Bd, *Cd;
    gpuErrchk( cudaMalloc((void **)&Ad, bytesA*chunk) );
    gpuErrchk( cudaMalloc((void **)&Bd, bytesB*chunk) );
    gpuErrchk( cudaMalloc((void **)&Cd, bytesC*chunk) );
    //host pinned mem allocation
    float *A, *B, *C;
    gpuErrchk( cudaMallocHost((void **)&A, bytesA*matN) );
    gpuErrchk( cudaMallocHost((void **)&B, bytesB*matN) );
    gpuErrchk( cudaMallocHost((void **)&C, bytesC*matN) );

    //host data init
    for(int i=0; i<matN; ++i){
        randomMatrix(M, K, A+(i*M*K));
        randomMatrix(K, N, B+(i*K*N));
    } 

    //event start
    createAndStartEvent(&startEvent, &stopEvent);

    if (square)
    {          
        label += "SQUARE";
        int size = N*N;
        for (int i = 0; i < iters; ++i) { 
            int j = i%nStream;            
            int idx = i*size*chunk;
            newSquareMatMulKer(A+idx, B+idx, C+idx, Ad, Bd, Cd, N, chunk, stream[j]); 
        }
    }
    else {
        ...
    } 

    msTot = endEvent(&startEvent, &stopEvent);
    #ifdef MEASURES          
        printMeasures(square, label, msTot, millis.count(), matN, iters, devId);
    #else
        float *_A, *_B, *_C, *tmpC;
        tmpC = (float *)calloc(1,bytesC*chunk);
        for (int s=0; s<matN; ++s)
        {
            _A = A+(s*M*K);
            _B = B+(s*K*N);
            _C = C+(s*M*N);
            memset(tmpC, 0, bytesC*chunk);

            hostMatMul(_A, _B, tmpC, M, K, N);
            checkMatEquality(_C, tmpC, M, N);
        }   
    #endif


/**** matmul.cu ****/

__global__ void squareMatMulKernel(float* A, float* B, float* C, int N, int chunk) {

    int ROW = blockIdx.x*blockDim.x+threadIdx.x;
    int COL = blockIdx.y*blockDim.y+threadIdx.y;
  

    if (ROW<N && COL<N) {
        int size=N*N;
        int offs = 0;
        float tmpSum=0.0f;
        
        for (int s=0; s<chunk; ++s)
        {
            offs = s*size;
            tmpSum = 0.0f;
        
            for (int i = 0; i < N; ++i) {
                tmpSum += A[offs+(ROW*N)+i] * B[offs+(i*N)+COL];
            }
        
            C[offs+(ROW*N)+COL] = tmpSum;
        }
    }
    return ;
}




void newSquareMatMulKer(float *A, float *B, float *C, float *Ad, float *Bd, float *Cd, 
            int n, int chunk, cudaStream_t strm)
{
    int size = n*n;
    int bytesMat = size*sizeof(float);

    dim3 dimBlock(BLOCK,BLOCK,1);
    dim3 dimGrid(GRIDx, GRIDy,1); 

    gpuErrchk( cudaMemcpyAsync(Ad, A, bytesMat*chunk, cudaMemcpyHostToDevice, strm) );    
    gpuErrchk( cudaMemcpyAsync(Bd, B, bytesMat*chunk, cudaMemcpyHostToDevice, strm) );   

    #ifdef LOWPAR
        squareMatMulGridStrideKer<<<dimGrid, dimBlock, 0, strm>>>(Ad, Bd, Cd, n, chunk);
    #else
        squareMatMulKernel<<<dimGrid, dimBlock, 0, strm>>>(Ad, Bd, Cd, n, chunk);
    #endif
    squareMatMulKernel<<<dimGrid, dimBlock, 0, strm>>>(Ad, Bd, Cd, n, chunk);

    gpuErrchk( cudaMemcpyAsync( C, Cd, bytesMat*chunk, cudaMemcpyDeviceToHost, strm) );

    cudaDeviceSynchronize();
        ^ ^ ^ ^ ^ ^
}

I tried to debug using cuda-gdb but nothing strange showed up, gpuErrchk doesn't throw errors in CUDA API calls. I run the code using memcheck too, both with and without cudaDeviceSynchronize and I got no error.

I think it can be a synchronization issue, but I can't understand the reason behind that. Can someone spot where I'm wrong? Other code advices are really appreciated too.

Solution

If you are using multiples streams, you may override Ad and Bd before using them.

Example with iters = 2 and nStream = 2 :

for (int i = 0; i < iters; ++i) { 
  int j = i%nStream;            
  int idx = i*size*chunk;
  newSquareMatMulKer(A+idx, B+idx, C+idx, Ad, Bd, Cd, N, chunk, stream[j]); 
}

From this loop, you will call

newSquareMatMulKer(A, B, C, Ad, Bd, Cd, N, chunk, stream[0]); // call 0
newSquareMatMulKer(A+idx, B+idx, C+idx, Ad, Bd, Cd, N, chunk, stream[1]); // call 1

As you are using the same memory area on device for both call, you may have several synchronizations issues:

call 1 start to copy A and B on device before call 0:squareMatMulKernel end, so you may use incorrect values of A and/or B to compute your first iteration.
call 1:squareMatMulKernel start before you retrieve the values of C from call 0, so you may override C with values from call 1.

To fix this problem, I see two approaches:

Using synchronization as in your example with cudaDeviceSynchronize();.
You can allocate more memory two device side (one workspace per stream), for example.

//device mem allocation
float *Ad, *Bd, *Cd;
gpuErrchk( cudaMalloc((void **)&Ad, bytesA*chunk*nStream) );
gpuErrchk( cudaMalloc((void **)&Bd, bytesB*chunk*nStream) );
gpuErrchk( cudaMalloc((void **)&Cd, bytesC*chunk*nStream) );

/* code here */

for (int i = 0; i < iters; ++i) { 
  int j = i%nStream;            
  int idx = i*size*chunk;
  int offset_stream = j*size*chunk;
  newSquareMatMulKer(A+idx, B+idx, C+idx, 
    Ad + offset_stream , 
    Bd + offset_stream , 
    Cd + offset_stream , N, chunk, stream[j]); 
}

In this case you don't need synchronization before the end of the loop.