cudanvidiareduction

CUDA Reduction minimum value and index


I implemented a minimum reduce using CUDA 8 by following this great explanation and modifying it

__inline__ __device__ int warpReduceMin(int val) 
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        int tmpVal = __shfl_down(val, offset);
        if (tmpVal < val)
        {
            val = tmpVal;
        }
    }
    return val;
}

__inline__ __device__ int blockReduceMin(int val) 
{

    static __shared__ int shared[32]; // Shared mem for 32 partial mins
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;

    val = warpReduceMin(val);     // Each warp performs partial reduction

    if (lane == 0)
    {
        shared[wid] = val; // Write reduced value to shared memory
    }

    __syncthreads();              // Wait for all partial reductions

    //read from shared memory only if that warp existed
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : INT_MAX;

    if (wid == 0)
    {
        val = warpReduceMin(val); //Final reduce within first warp
    }

    return val;
}

__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) {
    int minVal = INT_MAX;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
        i < N;
        i += blockDim.x * gridDim.x) 
    {
        minVal = min(minVal, in[i]);
    }
    minVal = blockReduceMin(minVal);
    if (threadIdx.x == 0)
    {
        atomicMin(out, minVal);
    }
}

and it works great and I'm getting the minimum value. However, I don't care about the minimum value, only about its index in the original input array.

I tried modifying my code a bit

__inline__ __device__ int warpReduceMin(int val, int* idx) // Adding output idx
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        int tmpVal = __shfl_down(val, offset);
        if (tmpVal < val)
        {
            *idx = blockIdx.x * blockDim.x + threadIdx.x + offset; // I guess I'm missing something here
            val = tmpVal;
        }
    }
    return val;
}

...
blockReduceMin stayed the same only adding idx to function calls
...

__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) {
    int minVal = INT_MAX;
    int minIdx = 0; // Added this
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
        i < N;
        i += blockDim.x * gridDim.x) 
    {
        if (in[i] < minVal)
        {
            minVal = in[i];
            minIdx = i; // Added this
        }
    }
    minVal = blockReduceMin(minVal, &minIdx);
    if (threadIdx.x == 0)
    {
        int old = atomicMin(out, minVal);
        if (old != minVal) // value was updated
        {
            atomicExch(out + 1, minIdx);
        }
    }
}

But it doesn't work. I feel that I'm missing something important and that this is not the way to go about it, but my search turned up no results.


Solution

  • There are several problems here. You need to modify both the warp and block minimum functions to propagate both the minimum value and its index every time a new local minimum is found. Perhaps something like this:

    __inline__ __device__ void warpReduceMin(int& val, int& idx)
    {
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            int tmpVal = __shfl_down(val, offset);
            int tmpIdx = __shfl_down(idx, offset);
            if (tmpVal < val) {
                val = tmpVal;
                idx = tmpIdx;
            }
        }
    }
    
    __inline__ __device__  void blockReduceMin(int& val, int& idx) 
    {
    
        static __shared__ int values[32], indices[32]; // Shared mem for 32 partial mins
        int lane = threadIdx.x % warpSize;
        int wid = threadIdx.x / warpSize;
    
        warpReduceMin(val, idx);     // Each warp performs partial reduction
    
        if (lane == 0) {
            values[wid] = val; // Write reduced value to shared memory
            indices[wid] = idx; // Write reduced value to shared memory
        }
    
        __syncthreads();              // Wait for all partial reductions
    
        //read from shared memory only if that warp existed
        if (threadIdx.x < blockDim.x / warpSize) {
            val = values[lane];
            idx = indices[lane];
        } else {
            val = INT_MAX;
            idx = 0;
        }
    
        if (wid == 0) {
             warpReduceMin(val, idx); //Final reduce within first warp
        }
    }
    

    [note: written in browser, never compiled or tested, use at own risk]

    That should leave every block holding it's correct local minimum and index. Then you have a second problem. This:

    int old = atomicMin(out, minVal);
    if (old != minVal) // value was updated
    {
        atomicExch(out + 1, minIdx);
    }
    

    is broken. There is no guarantee that the minimum value and its index will be correctly set in this code. This is because there is no guarantee that both atomic operations have any synchronisation and there is a potential race where one block may correctly overwrite the minimum value of another block, but then have its index overwritten by the block it replaced. The only solution here would be some sort of mutex, or run a second reduction kernel on the results of each block.