In the cuda examples, e.g. here, __match_all_sync
__match_any_sync
is used.
Here is an example where a warp is split into multiple (one or more) groups that each keep track of their own atomic counter.
// increment the value at ptr by 1 and return the old value
__device__ int atomicAggInc(int* ptr) {
int pred;
//const auto mask = __match_all_sync(__activemask(), ptr, &pred); //error, should be any_sync, not all_sync
const auto mask = __match_any_sync(__activemask(), ptr, &pred);
const auto leader = __ffs(mask) - 1; // select a leader
int res;
const auto lane_id = ThreadId() % warpSize;
if (lane_id == leader) { // leader does the update
res = atomicAdd(ptr, __popc(mask));
}
res = __shfl_sync(mask, res, leader); // get leader’s old value
return res + __popc(mask & ((1 << lane_id) - 1)); //compute old value
}
The __match_any_sync
here splits up the threads in the warp into groups that have the same ptr
value, so that each group can update its own ptr atomically without getting in the way of other threads.
I know the nvcc compiler (since cuda 9) does this sort of optimization under the hood automatically, but this is just about the mechanics of __match_any_sync
Is there a way to do this pre compute capability 7?
EDIT: The blog article has now been modified to reflect __match_any_sync()
rather than __match_all_sync()
, so any commentary to that effect below should be disregarded. The answer below is edited to reflect this.
Based on your statement:
this is just about the mechanics of
__match_any_sync
we will focus on a replacement for __match_any_sync
itself, not any other form of rewriting the atomicAggInc
function. Therefore, we must provide a mask that has the same value as would be returned by __match_any_sync()
on cc7.0 or higher architectures.
I believe this will require a loop, which broadcasts the ptr
value, in the worst case one iteration for each thread in the warp (since each thread could have a unique ptr
value) and testing which threads have the same value. There are various ways we could "optimize" this loop for this function, so as to possibly reduce the trip count from 32 to some lesser value, based on the actual ptr
values in each thread, but such optimization in my view introduces considerable complexity, which makes the worst-case processing time longer (as is typical of early-exit optimizations). So I will demonstrate a fairly simple method without this optimization.
The other consideration is what to do in the case of the warp not being converged? For that, we can employ __activemask()
to identify that case.
Here is a worked example:
$ cat t1646.cu
#include <iostream>
#include <stdio.h>
// increment the value at ptr by 1 and return the old value
__device__ int atomicAggInc(int* ptr) {
int mask;
#if __CUDA_ARCH__ >= 700
mask = __match_any_sync(__activemask(), (unsigned long long)ptr);
#else
unsigned tmask = __activemask();
for (int i = 0; i < warpSize; i++){
#ifdef USE_OPT
if ((1U<<i) & tmask){
#endif
unsigned long long tptr = __shfl_sync(tmask, (unsigned long long)ptr, i);
unsigned my_mask = __ballot_sync(tmask, (tptr == (unsigned long long)ptr));
if (i == (threadIdx.x & (warpSize-1))) mask = my_mask;}
#ifdef USE_OPT
}
#endif
#endif
int leader = __ffs(mask) - 1; // select a leader
int res;
unsigned lane_id = threadIdx.x % warpSize;
if (lane_id == leader) { // leader does the update
res = atomicAdd(ptr, __popc(mask));
}
res = __shfl_sync(mask, res, leader); // get leader’s old value
return res + __popc(mask & ((1 << lane_id) - 1)); //compute old value
}
__global__ void k(int *d){
int *ptr = d + threadIdx.x/4;
if ((threadIdx.x >= 16) && (threadIdx.x < 32))
atomicAggInc(ptr);
}
const int ds = 32;
int main(){
int *d_d, *h_d;
h_d = new int[ds];
cudaMalloc(&d_d, ds*sizeof(d_d[0]));
cudaMemset(d_d, 0, ds*sizeof(d_d[0]));
k<<<1,ds>>>(d_d);
cudaMemcpy(h_d, d_d, ds*sizeof(d_d[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < ds; i++)
std::cout << h_d[i] << " ";
std::cout << std::endl;
}
$ nvcc -o t1646 t1646.cu -DUSE_OPT
$ cuda-memcheck ./t1646
========= CUDA-MEMCHECK
0 0 0 0 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
========= ERROR SUMMARY: 0 errors
$
(CentOS 7, CUDA 10.1.243, with device 0 being Tesla V100, device 1 being a cc3.5 device).
I've added an optional optimization for the case where the warp is diverged (i.e. tmask
is not 0xFFFFFFFF
). This can be selected by defining USE_OPT
.