I wrote some cuda code that loads from global memory into registers, sorts 16 values. Then in order to merge them, I write them into a shared array:
// Per-thread 16-element sort kernel (fragment): each thread loads 16 u32s
// from global memory into registers, sorts them (elided below at `...`),
// and writes its sorted run into shared memory so runs can be merged.
// NOTE(review): `loc = blockIdx.x * 16` ignores threadIdx.x, so every
// thread of a block loads the SAME 16 elements -- presumably this should
// be (blockIdx.x * blockDim.x + tid) * 16; confirm against the launch.
// NOTE(review): 16 consecutive elements per thread means adjacent lanes
// read addresses 64 bytes apart -- these global loads are uncoalesced.
// NOTE(review): parameter `n` is unused here -- no bounds check on `arr`.
__global__ void sort16(u32* arr, u32 n) {
__shared__ u32 tmp[16*32]; // 512 elements, 2kB shared memory, hopefully fits? (4kB with tmp2 -- well under the 48kB/block limit)
__shared__ u32 tmp2[16*32]; // 512 elements, 2kB shared memory, hopefully fits?
u32 tid = threadIdx.x;
u32 loc = blockIdx.x * 16;
u32 tmp_loc = tid * 16; // each thread owns a private 16-slot slice of tmp
u32 a0 = arr[loc+0], a1 = arr[loc+1], a2 = arr[loc+2], a3 = arr[loc+3],
a4 = arr[loc+4], a5 = arr[loc+5], a6 = arr[loc+6], a7 = arr[loc+7];
u32 a8 = arr[loc+8], a9 = arr[loc+9], a10 = arr[loc+10], a11 = arr[loc+11],
a12 = arr[loc+12], a13 = arr[loc+13], a14 = arr[loc+14], a15 = arr[loc+15];
... // (register sort of a0..a15 elided by the asker)
tmp[tmp_loc] = a0; tmp[tmp_loc+1] = a1; tmp[tmp_loc+2] = a2; tmp[tmp_loc+3] = a3;
tmp[tmp_loc+4] = a4; tmp[tmp_loc+5] = a5; tmp[tmp_loc+6] = a6; tmp[tmp_loc+7] = a7;
tmp[tmp_loc+8] = a8; tmp[tmp_loc+9] = a9; tmp[tmp_loc+10] = a10; tmp[tmp_loc+11] = a11;
tmp[tmp_loc+12] = a12; tmp[tmp_loc+13] = a13; tmp[tmp_loc+14] = a14; tmp[tmp_loc+15] = a15;
__syncthreads(); // all runs must be visible in shared memory before merging
But now, I want to merge each block of 16 into 32, then 32 into 64, etc until my shared storage is too small. At the moment, I allocated two arrays of 512 elements. The last pass I want to write to global memory.
I don't understand how to map threads to this operation. I know how to write it sequentially with loops. In C I would write:
// Sequential reference for merging adjacent sorted runs of `size` elements
// from tmp into tmp2 (the asker's "how I'd write it in C" sketch).
// NOTE(review): output cursor `c` is never declared; it needs `u32 c = 0;`
// before the outer loop.
// NOTE(review): `a`/`b` serve both as run boundaries and as merge cursors:
// after one inner merge `a == enda`, so `a += 2*size` advances from the
// wrong base -- run start and cursor must be separate variables.
// NOTE(review): `__global__` functions need an explicit `void` return type,
// and as written this runs entirely sequentially on a single thread.
__global__ merge2(u32* tmp, u32*tmp2, u32 size) {
for (u32 a = 0; a < 512; a += 2*size) {
u32 b = a + size;
u32 enda = a + size, endb = b + size; // exclusive ends of the two runs
// standard two-cursor merge: take the smaller head element each step
while (a < enda && b < endb) {
if (tmp[a] < tmp[b])
tmp2[c++] = tmp[a++];
else
tmp2[c++] = tmp[b++];
}
// drain whichever run still has elements remaining
while (a < enda)
tmp2[c++] = tmp[a++];
while (b < endb)
tmp2[c++] = tmp[b++];
}
}
How can this be expressed in cuda with threads? This is sort of how I would envision the call, though I cannot figure out what the parameters for threads and blocks should be?
u32* t = tmp;
u32* t2 = tmp2;
for (u32 i = 16; i < 512; i *= 2) {
merge2<<<??, ??>>>(t, t2, i);
u32* ptrtmp = t;
t = t2;
t2 = ptrtmp;
}
One problem is that as the sorted runs grow, the number of threads available per merge doubles as well; alternatively, I could simply leave the extra threads idle.
Within a warp: using __shfl
instructions: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
Between warps in the same block:
Using __shared__
memory:
//block <<<1,64>>>
__shared__ int a[64];
const auto mine = threadIdx.x;
a[threadIdx.x] = mine;
__syncthreads(); //important, wait for all threads in the block to finish writing
const auto yours = a[63 - threadIdx.x];
printf("mine: %i, yours: %i\n", mine, yours);
Between blocks
Same as above, but using a global array allocated using cudaMalloc
on the host.
(You cannot — or rather, should not — allocate large blocks of memory on the GPU.)
Merge sort
Merge sort on the GPU is typically done by merging bitonic sequences, which are produced by bitonic sorting networks.
Bitonic sort is not the fastest asymptotically, but it has very low constant overhead and in practice is the fastest way to sort on current GPU hardware.
Below is example code: you can play with it on Godbolt: https://cuda.godbolt.org/z/3nM4Ma6ss
Note that you'll need C++17 or higher to run the code. The nvcc compiler will inline all the calls on the GPU and eliminate all constexpr expressions. The generated assembly is very fast and compact.
NVidia shows a nice demo of bitonic sort in: https://on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks.pdf
The way bitonic sort works, is that you first take an arbitrary input array and turn this into a bitonic sequence (Step 1 below). You then turn that bitonic sequence into a sorted one.
A bitonic sequence goes like this
/\
/ \
/ 5 elements, first increasing, then decreasing (or the other way 'round)
Because 32 = 2^5, you need 1+2+3+4 = 10 compare/exchange steps to create a bitonic sequence from a random one, and 5 more steps to turn that bitonic sequence into a sorted one.
Once you have 2 sorted sequences (A, B), you reverse the order of B and compare the elements side by side; all the large values go into B' and all the small ones into A'.
You now have two new bitonic sequences A' and B' with the nice property that every element in A' is <= every element in B'; also, any value present in both A' and B' must equal both max(A') and min(B'), so such duplicates are easy to detect.
Graphically:
/ / \ \/ \ /
/ / \ /\ -> / \
a: / b: / rev.b: \ a' = min(a,rev.b) / \ / \ b' = max(a,b):
After the minmax partition, you can easily sort A' and B' in only 5 steps (per 32 elements) (i.e.: log_2(n) steps), which is optimal.
The code below demonstrates bitonic merge sort for a single warp (to keep it simple).
If you want to sort bigger arrays, you first let every warp sort its own section of 32, write the results to shared memory, and then let each warp combine two sorted sequences as shown below.
If you have large arrays, with small keys, then radix sort may be more efficient than bitonic merge sort.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
//Returns a 32-bit mask with only the calling thread's lane bit set
//(bit i for lane i) -- the classic %lanemask_eq value.
__device__ int lanemask_eq() {
    const unsigned lane = threadIdx.x % 32u; //lane index within the warp
    return int(1u << lane);
}
//Bitonic compare/exchange step across a warp.
//https://on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks.pdf
//Exchanges x with the lane `mask` away (XOR of the lane id) and keeps
//either the smaller or the larger of the pair, depending on `dir`
//(nonzero dir -> keep the larger value).
//BUG FIX: the return type was `float` while the function computes a T;
//for integer keys above 2^24 the int->float->int round-trip silently
//corrupted values. The return type must be T.
//Uses a full participation mask (-1u): the entire warp must be active.
template <typename T>
__device__ T swap(const T x, const int mask, const bool dir) {
    const T y = __shfl_xor_sync(-1u, x, mask);
    //keep the partner's value when (x < y) matches the requested direction
    const T result = (x < y) == dir ? y : x;
    return result;
}
//Bitonic sort on a warp: one element per lane, full 32-lane network.
//on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks.pdf
//Each lane passes in one element x and gets back its element of the sorted
//sequence (ascending by lane id; descending when reverse=true).
//The hex constants are per-lane direction masks: bit i tells lane i whether
//to keep the max (bit set) or the min of its compare/exchange pair (see
//swap()); the second swap() argument is the XOR distance to the partner.
//NOTE(review): swap() uses a full shfl mask -- assumes all 32 lanes active.
template <typename T, bool reverse = false, int MaxLength = 32>
__device__ T WarpBitonicSortRegisters(T x) {
//uint32_t(-true) == 0xFFFFFFFF: XORing it below flips the direction of the
//final merge stages when a descending (reverse) result is requested.
constexpr auto rev = uint32_t(-reverse);
const auto lanebit = lanemask_eq(); //this lane's bit within the masks
static_assert(sizeof(T) == sizeof(int), "must be int sized");
x = swap(x, 1, (0x66666666 & lanebit)); //sort pairs, alternating direction
//Part1: create a bitonic sequence
//see: https://en.wikipedia.org/wiki/Bitonic_sorter
if constexpr (MaxLength > 4) {
//merge sorted pairs into runs of 4, alternating direction
x = swap(x, 2, (0x3C3C3C3C & lanebit));
x = swap(x, 1, (0x5A5A5A5A & lanebit));
if constexpr (MaxLength > 8) {
//merge runs of 4 into runs of 8
x = swap(x, 4, (0x0FF00FF0 & lanebit));
x = swap(x, 2, (0x33CC33CC & lanebit));
x = swap(x, 1, (0x55AA55AA & lanebit));
if constexpr (MaxLength > 16) {
//merge runs of 8 into runs of 16 -> the 32 lanes form one bitonic sequence
x = swap(x, 8, (0x00FFFF00 & lanebit));
x = swap(x, 4, (0x0F0FF0F0 & lanebit));
x = swap(x, 2, (0x3333CCCC & lanebit));
x = swap(x, 1, (0x5555AAAA & lanebit));
//Step 2: turn a bitonic sequence into a sorted one.
x = swap(x, 16, ((0xFFFF0000 ^ rev) & lanebit));
}
x = swap(x, 8, ((0xFF00FF00 ^ rev) & lanebit));
}
x = swap(x, 4, ((0xF0F0F0F0 ^ rev) & lanebit));
}
x = swap(x, 2, ((0xCCCCCCCC ^ rev) & lanebit));
x = swap(x, 1, ((0xAAAAAAAA ^ rev) & lanebit));
return x;
}
//Merge-only half of the bitonic network: sorts an input that is ALREADY a
//bitonic sequence across the warp (e.g. the output of mergeSort's min/max
//partition) in log2(MaxLength) swap steps. Compare with
//WarpBitonicSortRegisters, which first builds the bitonic sequence.
template <typename T, bool reverse = false, int MaxLength = 32>
__device__ T WarpBitonicSortRegisters2(T x) {
//uint32_t(-true) == 0xFFFFFFFF: flips every direction bit for descending output
constexpr auto rev = uint32_t(-reverse);
const auto lanebit = lanemask_eq(); //this lane's bit within the direction masks
static_assert(sizeof(T) == sizeof(int), "must be int sized");
//Turn the bitonic input into a sorted sequence
//see: https://en.wikipedia.org/wiki/Bitonic_sorter
if constexpr (MaxLength > 4) {
if constexpr (MaxLength > 8) {
if constexpr (MaxLength > 16) {
x = swap(x, 16, ((0xFFFF0000 ^ rev) & lanebit));
}
x = swap(x, 8, ((0xFF00FF00 ^ rev) & lanebit));
}
x = swap(x, 4, ((0xF0F0F0F0 ^ rev) & lanebit));
}
x = swap(x, 2, ((0xCCCCCCCC ^ rev) & lanebit));
x = swap(x, 1, ((0xAAAAAAAA ^ rev) & lanebit));
return x;
}
//Merges two warp-sorted sequences a and b (one element per lane, both
//ascending): afterwards every element of a is <= every element of b and
//both are again sorted ascending. (Despite the name this is the merge
//step, not a full sort.)
//BUG FIX: the original did `a = dir? a : b; b = dir? b : a;` -- the second
//select read the already-overwritten `a`, so whenever dir was false both
//registers ended up holding the old b and the old a was lost. Temporaries
//make both selects read the original values.
template <typename T>
__device__ void mergeSort(T& a, T& b) {
    static_assert(sizeof(int) == sizeof(T));
    //Preconditions: both inputs must be sorted ascending across the warp.
    assert(__shfl_down_sync(-1u, a, 1) >= a);
    assert(__shfl_down_sync(-1u, b, 1) >= b);
    b = __shfl_xor_sync(-1u, b, 31); //reverse the order of B: asc -> desc
    //lane-wise min/max partition against the reversed b
    const bool dir = a < b;
    const T lo = dir ? a : b;
    const T hi = dir ? b : a;
    a = lo;
    b = hi;
    //every element in A will now be <= to every element in B
    //and both A and B will be bitonic sequences,
    //meaning we can sort them in only 5 steps
    a = WarpBitonicSortRegisters2(a);
    b = WarpBitonicSortRegisters2(b);
}
//Sorts 64 ints from ddata with a single warp: each lane holds two elements,
//both 32-element halves are bitonic-sorted, then merged with mergeSort.
//Launch as <<<1,32>>>; the result is only printed, not written back.
__global__ void sortOneWarp(int* ddata) {
    const auto lane = threadIdx.x & 31;
    //one element per lane from each half of the 64-element input
    const auto Old = ddata[lane];
    const auto Old2 = ddata[lane + 32];
    //sort each half independently, then merge the two sorted sequences
    auto A = WarpBitonicSortRegisters(Old);
    auto B = WarpBitonicSortRegisters(Old2);
    mergeSort(A, B);
    printf("tid: %2i, new1: %i, old1: %i\n", threadIdx.x, A, Old);
    printf("tid: %2i, new2: %i, old2: %i\n", threadIdx.x+32, B, Old2);
}
//Host driver: fills 64 ints with pseudo-random values, sorts them on the
//GPU with a single 32-thread warp, and reports any CUDA error.
//Returns 0 on success, the non-zero cudaError_t value otherwise.
//Fixes vs. original: every CUDA call is now checked (kernel launches fail
//silently without cudaGetLastError), and ddata is freed instead of leaked.
int main() {
    int data[64];
    //for (auto start = 32; auto& i: data) { i = start--; }
    for (auto& i: data) { i = rand(); } //unseeded: reproducible across runs
    int* ddata = nullptr;
    cudaError_t err = cudaMalloc(&ddata, sizeof(data));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return int(err);
    }
    err = cudaMemcpy(ddata, &data[0], sizeof(data), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        sortOneWarp<<<1,32>>>(ddata);
        err = cudaGetLastError(); //catches bad launch configuration
    }
    if (err == cudaSuccess) {
        //wait for the kernel (flushes its printf) and surface async errors
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }
    cudaFree(ddata);
    return int(err);
}