Global arbitrary length integer incrementation in CUDA

In my CUDA program, every thread increments global (__device__) integer value and uses it for further calculations - every thread needs their own, unique value. I've used atomicAdd with local value

local_count = atomicAdd(&global_count, 1);

for this task which worked great until I needed to store this number as 256 bit integer rather than simple u32. I changed this global value into an array of eight u32, which needs rollover support. In CPU programming, this problem is straightforward, but in CUDA each thread might access this global value at the same time, which renders classic CPU implementation useless. I've tried with atomicInc and atomicCAS but the problem is still race conditions between threads as loading and/or incrementing upper u32 values is needed. What is the best way to tackle this problem?

Solution

One possible approach would be to use a critical section controlled by a semaphore as discussed here.

Here is a simple example based on that:

# cat t146.cu
#include <cuda/semaphore>
#include <iostream>
#include <cstdio>

__device__ cuda::binary_semaphore<cuda::thread_scope_device> s(1);
const int w = 4;
__device__ void my_inc(unsigned *d){

  bool done = false;
  int i = 0;
  while (!done){
    if (d[i] < 0xFFFFFFFFU) {d[i]++; done = true;}
    else {d[i] = 0; i++;}
    if (i >= w) done = true;}
}

__global__ void k(unsigned *d){

  s.acquire();
  my_inc(d);
  __threadfence();
  s.release();
}


int main(){

  unsigned *d;
  cudaMallocManaged(&d, sizeof(d[0])*w);
  memset(d, 0, sizeof(d[0])*w);
  d[0] = 0xFFFFFFFEU;
  k<<<64,1024>>>(d);
  cudaDeviceSynchronize();
  for (int i = w-1; i >=  0; i--) std::cout << d[i] << " ";
  std::cout << std::endl;
}
# nvcc -o t146 t146.cu -arch=sm_70
# compute-sanitizer ./t146
========= COMPUTE-SANITIZER
0 0 1 65534
========= ERROR SUMMARY: 0 errors
#

EDIT: After some additional reflection, it seems like a generalized addition operation (at least) could be constructed for a "long integer" using atomics. Here is an example:

# cat t147.cu
#include <iostream>

const int w = 4;
__device__ void my_add(unsigned *d, unsigned val){

  bool done = false;
  int i = 0;
  unsigned ret = atomicAdd(d, val);
  while (!done){
    unsigned long long test = (unsigned long long)ret + (unsigned long long)val;
    if (test > 0xFFFFFFFFULL) {
      // a rollover occurred
       i++;
       ret = atomicAdd(d+i, 1);
       val = 1;}
    else {done = true;}
    if (i >= (w-1)) done = true;}
}

__global__ void k(unsigned *d){
  unsigned id = blockIdx.x*blockDim.x+threadIdx.x;
  my_add(d, id);
}


int main(){

  unsigned *d;
  cudaMallocManaged(&d, sizeof(d[0])*w);
  memset(d, 0, sizeof(d[0])*w);
  k<<<1024,1024>>>(d);
  cudaDeviceSynchronize();
  for (int i = w-1; i >=  0; i--) std::cout << d[i] << " ";
  std::cout << std::endl;
}
# nvcc -o t147 t147.cu
# compute-sanitizer ./t147
========= COMPUTE-SANITIZER
0 0 127 4294443008
========= ERROR SUMMARY: 0 errors
#

To check the results, we note that this operation is basically the sum of integers from 0 to 1048575. That is 1048575x1048576/2 = 549,755,289,600. The code produced a result of 127x4,294,967,296 + 4,294,443,008 = 549,755,289,600.

Both approaches would require care to be taken if the values are read outside of the add/increment routine. The critical section method lends itself more easily to a "safe read" routine, if such were required. Use the same semaphore, and just read the value(s). The atomic method would be more difficult to manage in this respect.