Tags: c++, cuda, printf

Confused about the printf buffering rule in a CUDA __global__ function


The code below always prints "Hello from the start" before anything else and "Hello from the end" after everything else. Why is that?

Code:

#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"


__global__ void add1InGPU( int *devArr, int n, int nx )
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int id = nx * iy + ix;

    int whateverNum = 5;

    if (id == whateverNum) printf("Hello from the start\n");
    
    if ( id < n ) {
        devArr[id] += 1; 
        printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, threadIdx.y: %d, processing index %d\n", blockIdx.x, threadIdx.x, blockIdx.y, threadIdx.y, id );
    }
    else {
        printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, threadIdx.y: %d, skipped\n", blockIdx.x, threadIdx.x, blockIdx.y, threadIdx.y);
    }

    if (id == whateverNum) printf("Hello from the end\n");
}

int main(void)
{
    
    int* d_arr = NULL;
    cudaMalloc(&d_arr, 16 * sizeof(int));
    cudaMemset(d_arr, 0, 16 * sizeof(int));

    int nElem = 16;
    dim3 block( 2, 2 );
    dim3 grid( 2, 2 );
    // dim3 grid( ( nElem + block.x - 1 ) / block.x );
    
    add1InGPU<<<grid, block>>> ( d_arr, nElem, 4 );
    cudaDeviceSynchronize();
    int *h_arr = (int*)malloc(16 * sizeof(int));
    cudaMemcpy(h_arr, d_arr, 10 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_arr);

    volatile int _ = 0;
    for (int i = 0; i < 10; i++) {
        _ += h_arr[i];
    }

    free(h_arr);
    cudaDeviceReset();
    
    return 0;
}

Output:

Hello from the start
blockIdx.x: 1, threadIdx.x: 0, blockIdx.y: 0, threadIdx.y: 0, processing index 2
blockIdx.x: 1, threadIdx.x: 1, blockIdx.y: 0, threadIdx.y: 0, processing index 3
blockIdx.x: 1, threadIdx.x: 0, blockIdx.y: 0, threadIdx.y: 1, processing index 6
blockIdx.x: 1, threadIdx.x: 1, blockIdx.y: 0, threadIdx.y: 1, processing index 7
blockIdx.x: 0, threadIdx.x: 0, blockIdx.y: 1, threadIdx.y: 0, processing index 8
blockIdx.x: 0, threadIdx.x: 1, blockIdx.y: 1, threadIdx.y: 0, processing index 9
blockIdx.x: 0, threadIdx.x: 0, blockIdx.y: 1, threadIdx.y: 1, processing index 12
blockIdx.x: 0, threadIdx.x: 1, blockIdx.y: 1, threadIdx.y: 1, processing index 13
blockIdx.x: 1, threadIdx.x: 0, blockIdx.y: 1, threadIdx.y: 0, processing index 10
blockIdx.x: 1, threadIdx.x: 1, blockIdx.y: 1, threadIdx.y: 0, processing index 11
blockIdx.x: 1, threadIdx.x: 0, blockIdx.y: 1, threadIdx.y: 1, processing index 14
blockIdx.x: 1, threadIdx.x: 1, blockIdx.y: 1, threadIdx.y: 1, processing index 15
blockIdx.x: 0, threadIdx.x: 0, blockIdx.y: 0, threadIdx.y: 0, processing index 0
blockIdx.x: 0, threadIdx.x: 1, blockIdx.y: 0, threadIdx.y: 0, processing index 1
blockIdx.x: 0, threadIdx.x: 0, blockIdx.y: 0, threadIdx.y: 1, processing index 4
blockIdx.x: 0, threadIdx.x: 1, blockIdx.y: 0, threadIdx.y: 1, processing index 5
Hello from the end

Solution

  • Confused about the printf buffering rule in a CUDA __global__ function

    Your confusion arises because there is no such rule. The scheduling and execution order of blocks, and of the warps of threads (or, on some architectures, even individual threads) that make up those blocks, is undefined.
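
    If you need a deterministic order, one option (a minimal sketch, not taken from the question; recordBlock and d_order are names invented here) is to have each thread record its result in global memory instead of printing from the device, and let the host print in index order after the copy back:

        #include <cstdio>
        #include "cuda_runtime.h"

        // Each thread records which linear block handled its index, rather
        // than printing from the device, so the host controls output order.
        __global__ void recordBlock( int *order, int n, int nx )
        {
            int ix = blockIdx.x * blockDim.x + threadIdx.x;
            int iy = blockIdx.y * blockDim.y + threadIdx.y;
            int id = nx * iy + ix;
            if ( id < n ) order[id] = blockIdx.y * gridDim.x + blockIdx.x;
        }

        int main(void)
        {
            const int n = 16;
            int *d_order = NULL;
            cudaMalloc(&d_order, n * sizeof(int));

            dim3 block( 2, 2 );
            dim3 grid( 2, 2 );
            recordBlock<<<grid, block>>>( d_order, n, 4 );

            int h_order[n];
            // A blocking memcpy on the default stream waits for the kernel.
            cudaMemcpy(h_order, d_order, n * sizeof(int), cudaMemcpyDeviceToHost);

            // The host loop runs in index order regardless of GPU scheduling.
            for (int i = 0; i < n; i++)
                printf("index %d was processed by block %d\n", i, h_order[i]);

            cudaFree(d_order);
            return 0;
        }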

    The code below always prints out "Hello from the start" before anything else

    It doesn't always print in that order; rather, you have only ever observed it printing in that order. If you ran the kernel with a significantly larger grid and/or on different hardware, you might see different results. This is the nature of undefined ordering: sometimes it appears perfectly predictable even when nothing guarantees it.
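
    As an aside on the "buffering" in your title: in-kernel printf does buffer, but that is separate from ordering. Output goes into a fixed-size device-side FIFO that is only transferred to the host at synchronization points such as cudaDeviceSynchronize(). With a much larger grid, that circular buffer can wrap and silently drop lines, so when experimenting at scale it is worth enlarging it. A sketch (the 8 MB figure is an arbitrary choice):

        // Enlarge the device-side printf FIFO (the default is on the order
        // of 1 MB) so output from many threads is not overwritten when the
        // circular buffer wraps. Must be set before launching any kernel
        // that uses printf.
        size_t printfFifoBytes = 8 * 1024 * 1024;  // 8 MB: arbitrary larger value
        cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoBytes);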