memory · cuda · nsight-compute

Vectorized Memory Stores Reduce Load Instructions


I have a kernel that is 16x coarsened (1x16 tiling). To reduce the number of STG (store global) instructions I implemented vectorized memory accesses, via uchar4 in my case. When I took a look at the memory chart I saw this:

Scalar Mem Access: [memory chart screenshot]

Vectorized Mem Access: [memory chart screenshot]

How is this even possible? The global load instructions are reduced as well. These are the kernels:

__global__ void k_1D_gf_3x3_vectorized16_global(unsigned char* input, unsigned char* output, int rows, int cols)
{
    int ty = (blockIdx.x * blockDim.x + threadIdx.x) * 16;
    int tx = blockIdx.y * blockDim.y + threadIdx.y;

    int vals[16] = { 0 };
    unsigned char frame[3][3];

    if ((tx > 0 && tx < rows - 1) && (ty > 0 && ty  < cols - 1)) {
        frame[0][0] = input[(tx - 1) * cols + ty - 1];
        frame[0][1] = input[(tx - 1) * cols + ty];
        frame[0][2] = input[(tx - 1) * cols + ty + 1];
        frame[1][0] = input[tx * cols + ty - 1];
        frame[1][1] = input[tx * cols + ty];
        frame[1][2] = input[tx * cols + ty + 1];
        frame[2][0] = input[(tx + 1) * cols + ty - 1];
        frame[2][1] = input[(tx + 1) * cols + ty];
        frame[2][2] = input[(tx + 1) * cols + ty + 1];

        vals[0] = (global_conv_kernel3x3[0][0] * frame[0][0]
            + global_conv_kernel3x3[0][1] * frame[0][1]
            + global_conv_kernel3x3[0][2] * frame[0][2]
            + global_conv_kernel3x3[1][0] * frame[1][0]
            + global_conv_kernel3x3[1][1] * frame[1][1]
            + global_conv_kernel3x3[1][2] * frame[1][2]
            + global_conv_kernel3x3[2][0] * frame[2][0]
            + global_conv_kernel3x3[2][1] * frame[2][1]
            + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;

        for (int i = 1; i < 16; i++) {
            int _ty = ty + i;
            shift_left(frame);
            if ((tx > 0 && tx < rows - 1) && (_ty > 0 && _ty < cols - 1)) {
                frame[0][2] = input[(tx - 1) * cols + _ty + 1];
                frame[1][2] = input[tx * cols + _ty + 1];
                frame[2][2] = input[(tx + 1) * cols + _ty + 1];

                vals[i] = (global_conv_kernel3x3[0][0] * frame[0][0]
                    + global_conv_kernel3x3[0][1] * frame[0][1]
                    + global_conv_kernel3x3[0][2] * frame[0][2]
                    + global_conv_kernel3x3[1][0] * frame[1][0]
                    + global_conv_kernel3x3[1][1] * frame[1][1]
                    + global_conv_kernel3x3[1][2] * frame[1][2]
                    + global_conv_kernel3x3[2][0] * frame[2][0]
                    + global_conv_kernel3x3[2][1] * frame[2][1]
                    + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;
            }
        }
        reinterpret_cast<uchar4*>(&output[(tx * cols + ty)])[0] = make_uchar4(vals[0], vals[1], vals[2], vals[3]);
        reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 4)])[0] = make_uchar4(vals[4], vals[5], vals[6], vals[7]);
        reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 8)])[0] = make_uchar4(vals[8], vals[9], vals[10], vals[11]);
        reinterpret_cast<uchar4*>(&output[(tx * cols + ty + 12)])[0] = make_uchar4(vals[12], vals[13], vals[14], vals[15]);
    }
}
__global__ void k_1D_gf_3x3_load_balance16_global(unsigned char* input, unsigned char* output, int rows, int cols)
{
    int ty = (blockIdx.x * blockDim.x + threadIdx.x) * 16;
    int tx = blockIdx.y * blockDim.y + threadIdx.y;

    unsigned char frame[3][3];

    if ((tx > 0 && tx < rows - 1) && (ty > 0 && ty < cols - 1)) {
        frame[0][0] = input[(tx - 1) * cols + ty - 1];
        frame[0][1] = input[(tx - 1) * cols + ty];
        frame[0][2] = input[(tx - 1) * cols + ty + 1];
        frame[1][0] = input[tx * cols + ty - 1];
        frame[1][1] = input[tx * cols + ty];
        frame[1][2] = input[tx * cols + ty + 1];
        frame[2][0] = input[(tx + 1) * cols + ty - 1];
        frame[2][1] = input[(tx + 1) * cols + ty];
        frame[2][2] = input[(tx + 1) * cols + ty + 1];

        output[(tx * cols + ty)] = (global_conv_kernel3x3[0][0] * frame[0][0]
        + global_conv_kernel3x3[0][1] * frame[0][1]
        + global_conv_kernel3x3[0][2] * frame[0][2]
        + global_conv_kernel3x3[1][0] * frame[1][0]
        + global_conv_kernel3x3[1][1] * frame[1][1]
        + global_conv_kernel3x3[1][2] * frame[1][2]
        + global_conv_kernel3x3[2][0] * frame[2][0]
        + global_conv_kernel3x3[2][1] * frame[2][1]
        + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;

        for (int i = 1; i < 16; i++) {
            int _ty = ty + i;
            shift_left(frame);
            if ((tx > 0 && tx < rows - 1) && (_ty > 0 && _ty < cols - 1)) {
                frame[0][2] = input[(tx - 1) * cols + _ty + 1];
                frame[1][2] = input[tx * cols + _ty + 1];
                frame[2][2] = input[(tx + 1) * cols + _ty + 1];

                output[(tx * cols + _ty)] = (global_conv_kernel3x3[0][0] * frame[0][0]
                + global_conv_kernel3x3[0][1] * frame[0][1]
                + global_conv_kernel3x3[0][2] * frame[0][2]
                + global_conv_kernel3x3[1][0] * frame[1][0]
                + global_conv_kernel3x3[1][1] * frame[1][1]
                + global_conv_kernel3x3[1][2] * frame[1][2]
                + global_conv_kernel3x3[2][0] * frame[2][0]
                + global_conv_kernel3x3[2][1] * frame[2][1]
                + global_conv_kernel3x3[2][2] * frame[2][2]) >> 4;
            }
        }
    }
}

As you can see, the only difference is how the output elements are stored. As I expected, this reduced the global memory accesses for storing the elements (STG) by 75%.
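To be concrete, per thread the scalar kernel issues 16 one-byte stores while the vectorized kernel issues 4 four-byte stores, i.e. 4/16 = 25% of the store instructions, a 75% reduction, while the number of bytes written stays the same. A stripped-down sketch of the two store patterns (a hypothetical toy kernel, not one of the kernels above):

    #include <cuda_runtime.h>

    // Each thread writes 4 consecutive bytes. The scalar path issues 4
    // one-byte STG instructions; the uchar4 path issues a single 4-byte STG.
    __global__ void store_compare(unsigned char* out, int use_vectorized)
    {
        int i = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
        if (use_vectorized) {
            // one 32-bit store; requires &out[i] to be 4-byte aligned
            reinterpret_cast<uchar4*>(&out[i])[0] = make_uchar4(0, 1, 2, 3);
        } else {
            // four 8-bit stores
            out[i + 0] = 0;
            out[i + 1] = 1;
            out[i + 2] = 2;
            out[i + 3] = 3;
        }
    }

Note that the uchar4 stores are only safe if &output[tx * cols + ty] is 4-byte aligned; since ty is a multiple of 16, this holds as long as output comes from cudaMalloc and cols is a multiple of 4.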


Solution

  • The number of global loads is reduced because, in the non-vectorized version, the compiler cannot confirm that global_conv_kernel3x3 does not overlap with output, which you write to, and therefore has to reload the filter values after each store to make sure they have not changed.

    In the PTX source for the regular kernel, you can see the same global load issued twice:

     ld.global.u64  %rd7, [global_conv_kernel3x3];
    ...
     ld.global.u64  %rd15, [global_conv_kernel3x3];
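
    A related option is to add the __restrict__ qualifier to the pointer parameters, which also tells the compiler that the pointers do not alias; whether that removes this particular reload depends on how global_conv_kernel3x3 is declared, since the filter is not a kernel parameter. A minimal signature sketch (assuming the kernel body stays unchanged):

        __global__ void k_1D_gf_3x3_load_balance16_global(
            const unsigned char* __restrict__ input,
            unsigned char* __restrict__ output,
            int rows, int cols)
        {
            // body as in the original kernel; __restrict__ promises that
            // nothing written through output is read through another name,
            // so values already loaded from global memory can stay in
            // registers across stores
        }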
    

    In order to address this, you can mark global_conv_kernel3x3 as __constant__ to tell the compiler the value will never change. I have confirmed locally that this addresses your issue:

    $ head -n4 test.cu
    __constant__ char** global_conv_kernel3x3;
    
    __global__ void k_1D_gf_3x3_load_balance16_global(unsigned char* input, unsigned char* output, int rows, int cols)
    {
    $ nvcc -ptx test.cu
    $ cat test.ptx | grep global_conv
    .const .align 8 .u64 global_conv_kernel3x3;
     ld.const.u64  %rd8, [global_conv_kernel3x3];
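
    For completeness, a hypothetical sketch of how the __constant__ filter could be declared and filled from the host; the 3x3 weights below are only an assumption, chosen to sum to 16 so that the >> 4 normalization in your kernels still holds:

        #include <cuda_runtime.h>

        // Assumed layout: a 3x3 integer filter in constant memory.
        __constant__ int global_conv_kernel3x3[3][3];

        void upload_filter()
        {
            // Hypothetical Gaussian-like weights summing to 16 (matches ">> 4").
            const int h_kernel[3][3] = { { 1, 2, 1 },
                                         { 2, 4, 2 },
                                         { 1, 2, 1 } };
            // __constant__ symbols are written with cudaMemcpyToSymbol,
            // not with a plain cudaMemcpy.
            cudaMemcpyToSymbol(global_conv_kernel3x3, h_kernel, sizeof(h_kernel));
        }

    As a bonus, constant memory is served by the constant cache and is broadcast when all threads in a warp read the same address, which is exactly the access pattern of a small convolution filter.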