cudavolatilereductiongpu-warp

CUDA Reduction: Warp Unrolling (School)


I am currently working on a project in which I am unrolling the last warp of a reduction. I have finished the code above; however, some modifications were done by guessing and I'd like an explanation why. The code I have written is only the function kernel4

// in is input array, out is where to store result, n is number of elements from in
// T is a float (32bit)
__global__ void kernel4(T *in, T *out, unsigned int n)

which is a reduction algorithm, the rest of the code was already provided.

Code:

#include <stdlib.h>
#include <stdio.h>

#include "timer.h"
#include "cuda_utils.h"

typedef float T;

#define N_ (8 * 1024 * 1024)
#define MAX_THREADS 256
#define MAX_BLOCKS 64

#define MIN(x,y) ((x < y) ? x : y)
#define tid threadIdx.x 
#define bid blockIdx.x 
#define bdim blockDim.x
#define warp_size 32

unsigned int nextPow2( unsigned int x ) {
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
    if (whichKernel < 3) {
        threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
        blocks = (n + threads - 1) / threads;
    } else {
        threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
        blocks = (n + (threads * 2 - 1)) / (threads * 2);
    }
    if (whichKernel == 5)
        blocks = MIN(maxBlocks, blocks);
}

T reduce_cpu(T *data, int n) {
    T sum = data[0];
    T c = (T) 0.0;
    for (int i = 1; i < n; i++)
    {
        T y = data[i] - c;
        T t = sum + y;
        c = (t - sum) - y;
        sum = t;
    } 
    return sum;
}

__global__ void
kernel4(T *in, T *out, unsigned int n)
{
    __shared__ volatile T d[MAX_THREADS];

    unsigned int i = bid * bdim + tid;

    n >>= 1;
    d[tid] = (i < n) ? in[i] + in[i+n] : 0;
    __syncthreads ();

    for(unsigned int s = bdim >> 1; s > warp_size; s >>= 1) {
        if(tid < s)
            d[tid] += d[tid + s];
        __syncthreads ();
    }

    if (tid < warp_size) {
        if (n > 64) d[tid] += d[tid + 32];
        if (n > 32) d[tid] += d[tid + 16];
        d[tid] += d[tid + 8];
        d[tid] += d[tid + 4];
        d[tid] += d[tid + 2];
        d[tid] += d[tid + 1];
    }

    if(tid == 0)
        out[bid] = d[0];
}

int main(int argc, char** argv)
{
    T *h_idata, h_odata, h_cpu;
    T *d_idata, *d_odata;   

    struct stopwatch_t* timer = NULL;
    long double t_kernel_4, t_cpu;

    int whichKernel = 4, threads, blocks, N, i;
    if(argc > 1) {
        N = atoi (argv[1]);
        printf("N: %d\n", N);
    } else {
        N = N_;
        printf("N: %d\n", N);
    }

    getNumBlocksAndThreads (whichKernel, N, MAX_BLOCKS, MAX_THREADS, blocks, threads);

    stopwatch_init ();
    timer = stopwatch_create ();

    h_idata = (T*) malloc (N * sizeof (T));
    CUDA_CHECK_ERROR (cudaMalloc (&d_idata, N * sizeof (T)));
    CUDA_CHECK_ERROR (cudaMalloc (&d_odata, blocks * sizeof (T)));

    srand48(time(NULL));
    for(i = 0; i < N; i++)
        h_idata[i] = drand48() / 100000;
    CUDA_CHECK_ERROR (cudaMemcpy (d_idata, h_idata, N * sizeof (T), cudaMemcpyHostToDevice));

    dim3 gb(blocks, 1, 1);
    dim3 tb(threads, 1, 1);

    kernel4 <<<gb, tb>>> (d_idata, d_odata, N);
    cudaThreadSynchronize ();

    stopwatch_start (timer);

    kernel4 <<<gb, tb>>> (d_idata, d_odata, N);
    int s = blocks;
    while(s > 1) {
        threads = 0;
        blocks = 0;
        getNumBlocksAndThreads (whichKernel, s, MAX_BLOCKS, MAX_THREADS, blocks, threads);

        dim3 gb(blocks, 1, 1);
        dim3 tb(threads, 1, 1);

        kernel4 <<<gb, tb>>> (d_odata, d_odata, s);
        s = (s + threads * 2 - 1) / (threads * 2);
    }
    cudaThreadSynchronize ();

    t_kernel_4 = stopwatch_stop (timer);
    fprintf (stdout, "Time to execute unrolled GPU reduction kernel: %Lg secs\n", t_kernel_4);

    double bw = (N * sizeof(T)) / (t_kernel_4 * 1e9);   // total bits / time
    fprintf (stdout, "Effective bandwidth: %.2lf GB/s\n", bw);
    CUDA_CHECK_ERROR (cudaMemcpy (&h_odata, d_odata, sizeof (T), cudaMemcpyDeviceToHost));

    stopwatch_start (timer);
    h_cpu = reduce_cpu (h_idata, N);
    t_cpu = stopwatch_stop (timer);
    fprintf (stdout, "Time to execute naive CPU reduction: %Lg secs\n", t_cpu);

    if(abs (h_odata - h_cpu) > 1e-5)
        fprintf(stderr, "FAILURE: GPU: %f  CPU: %f\n", h_odata, h_cpu);
    else
        printf("SUCCESS: GPU: %f  CPU: %f\n", h_odata, h_cpu);
    return 0;
}

My first question is: when declaring

__shared__ volatile T d[MAX_THREADS];

I would like to verify my understanding of volatile. Volatile prevents compilers from incorrectly optimizing my code and promises that load/stores are completed through the cache and not just registers (please correct me if wrong). For reduction, if partial reduction sums are still stored in registers, why is this a problem?

My second question is: when doing the actual warp reduction

    if (tid < warp_size) { // Final log2(32) = 5 strides
        if (n > 64) d[tid] += d[tid + 32];
        if (n > 32) d[tid] += d[tid + 16];
        d[tid] += d[tid + 8];
        d[tid] += d[tid + 4];
        d[tid] += d[tid + 2];
        d[tid] += d[tid + 1];
    }

The reduction sum will yield incorrect results without (n > 64) and (n > 32) conditions. The results I get are:

FAILURE: GPU: 41.966557  CPU: 41.946209

With 5 trials, the GPU reduction consistently yields an error of 0.0204. I am wary to think this is a floating point operation error.

To be honest as well, my teacher's assistant suggested this change to add the (n > 64) and (n > 32) conditions but did not explain why it would fix the code.

Since n in my trials are over 64, why does this conditional change the results. I am having difficulty tracing back the problem because I cannot use print functions like I would in a CPU.


Solution

  • Let's start with a few preface comments before we tackle your two questions:

    OK, now your questions:

    I would like to verify my understanding of volatile. Volatile prevents compilers from incorrectly optimizing my code and promises that load/stores are completed through the cache and not just registers (please correct me if wrong). For reduction, if partial reduction sums are still stored in registers, why is this a problem?

    Regarding a definition of volatile, I would refer you to the CUDA programming guide. I have seen summary descriptions referring to this as preventing a register optimization or preventing reordering of loads and stores. I prefer the former and will use that as a working definition.

    The basic idea is that volatile forces any reference (read or write) to that variable to actually go to the memory subsystem. By this I mean it will perform a read or write, and will not attempt to use a value previously loaded into a register. Without this qualifier, the compiler is free to load a value once (for example) from the actual memory location, and then maintain that value (and any updates to it) in a register, for as long as it deems appropriate. Compilers do this with an eye toward performance. (As an aside, note that you used the word "cache" here. I would avoid that usage here. Shared memory has no cache interposed between it and the processor load/store mechanism.)

    Without volatile in this type of warp-synchronous coding, we will run into a problem if we allow the compiler to "optimize" (i.e. maintain) intermediate values into registers. This primarily comes about due to inter-thread communication. To see clearly why, let's look at the last 2 steps in your final reduction:

        d[tid] += d[tid + 2];
        d[tid] += d[tid + 1];
    

    Let's consider just threads whose tid values are 0-1. In the second-last step, thread 0 will pick up the d[2] value and add it to the d[0] value, while thread 1 will pick up the d[3] value and add it to the d[1] value. At this point, if we don't use volatile, the compiler is not obligated to write the d[1] value accumulated by thread 1 back out to shared memory. It is allowed to maintain that in a register. So the d[1] value as seen in shared memory is not "up-to-date".

    Now lets go to the last step. In this step, thread 0 reads the d[1] value from shared memory and adds it to the d[0] value. But without volatile, we saw in the previous step that the shared memory contents of d[1] are no longer accurate. OTOH, if we use volatile, then the write to shared memory in the previous step will actually take place, and in the final step, thread 0 will pick up the correct value when it reads d[1]. A CUDA thread is a standalone model. By that, I mean that one thread cannot directly access values contained in registers belonging to another thread. So inter-thread communication at the warp level will normally be accomplished either through shared memory, or via warp-shuffle operations.

    __syncthreads() has a similar behavior: it forces all register-optimized values like this to be written out to memory, so that they are "visible" to other threads in the block. Therefore, a more sophisticated optimization would be to only switch to a volatile qualified pointer when the reduction switches from the loop-driven __syncthreads() based reduction to the final warp-synchronous reduction. You can see an example in the tutorial slides I linked at the beginning of this answer.

    As another aside, warp-synchronous programming of this kind is (more officially) deprecated in CUDA 9. Instead, you should use cooperative groups.

    The reduction sum will yield incorrect results without (n > 64) and (n > 32) conditions.

    These conditionals are primarily used because the code is designed to be "correct" for any block configuration that has a power-of-2 size. If we assume that the block size (number of threads per block) is a power of 2, and greater than 64, it must be 128 or larger for example. Your n variable starts out as the block size, but then gets multiplied by 2:

    n >>= 1;
    

    Therefore, if we want to ensure the correctness of this line of code:

    d[tid] += d[tid + 32];
    

    then we should only apply that operation when the thread block size is 64 (at least) which is like saying that n is greater than 64:

        if (n > 64) d[tid] += d[tid + 32];
    

    regarding this question, the claim is made that the posted code behaves differently if the if (n > 64) is included or not. The reason for this is that the posted code includes a loop which recalculates thread count and block count as the reduction proceeds:

    int s = blocks;
    while(s > 1) {
        threads = 0;
        blocks = 0;
        getNumBlocksAndThreads (whichKernel, s, MAX_BLOCKS, MAX_THREADS, blocks, threads);
    

    This loop eventually results in a block size that is smaller than 128, meaning the omission of the if conditions leads to breakage. (simply print out the threads variable, during this loop).

    regarding this:

    I am having difficulty tracing back the problem because I cannot use print functions like I would in a CPU.

    I'm not sure what the problem is there. printf should work from within kernel code.