Kernel foo's goal is to compute the sum of the values whose id equals ids[0]. I checked that the mask acquired from __match_any_sync
correctly identifies all the other threads in the warp with the same id. The if condition also works as expected; it only lets thread 0 and the other threads with the same id enter. However, the sum produced by __shfl_down_sync
is wrong.
vals
and ids
are input arrays of length 32. opt
is the output that stores the final sum. I tried with only one warp (foo<<<1,32>>>
).
// BUG (the subject of this question): `mask` from __match_any_sync can be
// non-contiguous, but __shfl_down_sync's delta is a lane offset — it does not
// skip lanes whose bits are unset in the mask. Active lanes therefore read
// from inactive lanes, and per the programming guide those reads are undefined.
__global__ void foo(const float *vals, unsigned *ids, float *opt) {
unsigned laneId = threadIdx.x & 0x1f;  // lane index within the warp (0..31)
float val = vals[laneId];
// mask = bit set of lanes whose ids[laneId] equals this lane's id
unsigned mask = __match_any_sync(__activemask(), ids[laneId]);
if (mask & 0b1) {  // only the id-group that contains lane 0 proceeds
for (int i = 16; i > 0; i /= 2) {
val += __shfl_down_sync(mask, val, i);  // undefined whenever lane (laneId + i) is not in mask
}
if (laneId == 0) {
atomicAdd(opt, val);
}
}
}
There is also another kernel bar that does not have ids. It just computes the sum of all values at even indexes (basically as if the ids at all even indexes were 1 and at all odd indexes 0). This produces the correct sum.
// Sums the values at even lane indexes; only even lanes enter the branch.
__global__ void bar(const float *vals, float *opt) {
int laneId = threadIdx.x & 0x1f;  // lane index within the warp (0..31)
float val = vals[laneId];
if (laneId % 2 == 0) {
// NOTE(review): this literal has 64 binary digits; it is narrowed to the
// 32-bit mask parameter (low 32 bits = 0x55555555, i.e. the even lanes).
// Odd lanes are inactive inside this branch, so iterations whose target
// lane is odd (e.g. delta 1) read from inactive lanes — technically
// undefined per the programming guide, even though it happens to produce
// the right sum here.
for (int i = 16; i >= 1; i /= 2) {
val += __shfl_down_sync(0b0101010101010101010101010101010101010101010101010101010101010101, val, i);
}
if (laneId == 0) {
atomicAdd(opt, val);
}
}
}
I've read the Nvidia blogs 1 2. I still don't understand what is wrong with foo :( My best guess is that there is some warp-synchronization issue between __match_any_sync
and __shfl_down_sync
. An explanation of what went wrong would be very helpful!
Environment: WIN11 CUDA-12.2 CC89
A working example showing the wrong results:
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <random>
// Stringification helpers used to build the FILE:LINE prefix below.
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
// Expands to a "file.cu:123" string literal at the point of use.
#define FILE_LINE __FILE__ ":" STR(__LINE__)
// Checks a CUDA runtime call and exits the process on error.
// NOTE(review): the trailing ';' after while(0) means call sites must NOT add
// their own semicolon — the code below consistently calls it without one.
#define CUDA_CHECK_THROW(x) \
do { \
cudaError_t result = x; \
if (result != cudaSuccess) { \
std::cout << FILE_LINE << " CUDA ERROR: " << cudaGetErrorString(result) << std::endl; \
exit(-1); \
} \
} while(0);
// BUG (the subject of this question): `mask` from __match_any_sync can be
// non-contiguous, but __shfl_down_sync's delta is a lane offset — it does not
// skip lanes whose bits are unset in the mask. Active lanes therefore read
// from inactive lanes, and per the programming guide those reads are undefined.
__global__ void foo(const float *vals, unsigned *ids, float *opt) {
unsigned laneId = threadIdx.x & 0x1f;  // lane index within the warp (0..31)
float val = vals[laneId];
// mask = bit set of lanes whose ids[laneId] equals this lane's id
unsigned mask = __match_any_sync(__activemask(), ids[laneId]);
if (mask & 0b1) {  // only the id-group that contains lane 0 proceeds
for (int i = 16; i > 0; i /= 2) {
val += __shfl_down_sync(mask, val, i);  // undefined whenever lane (laneId + i) is not in mask
}
if (laneId == 0) {
atomicAdd(opt, val);
}
}
}
// Host driver: fills 32 random values, randomly assigns id 0/1 per lane
// (lane 0 always gets id 1), computes the expected sum of id==1 values on the
// CPU, then launches foo on a single warp and compares the results.
int main() {
std::random_device rd;
std::mt19937 mt(rd());
std::uniform_real_distribution<float> dist(0.0, 1.5);
float *r, *val;
unsigned *mask;
CUDA_CHECK_THROW(cudaMalloc(&r, sizeof(float)))
CUDA_CHECK_THROW(cudaMalloc(&val, sizeof(float) * 32))
CUDA_CHECK_THROW(cudaMalloc(&mask, sizeof(unsigned) * 32))
// FIX: the original line was missing the closing parenthesis on this call.
CUDA_CHECK_THROW(cudaMemset(r, 0, sizeof(float)))
float h_val[32];
unsigned h_mask[32];
float sum = 0.0f;
for (int i = 31; i >= 0; --i) {
float v = dist(mt);
h_val[i] = v;
if (i == 0 || dist(mt) >= 0.5f) {  // lane 0 always belongs to the selected group
sum += v;
h_mask[i] = 1;
} else {
h_mask[i] = 0;
}
}
CUDA_CHECK_THROW(cudaMemcpy(val, h_val, 32 * sizeof(float), cudaMemcpyHostToDevice))
CUDA_CHECK_THROW(cudaMemcpy(mask, h_mask, 32 * sizeof(unsigned), cudaMemcpyHostToDevice))
foo<<<1, 32>>>(val, mask, r);
CUDA_CHECK_THROW(cudaGetLastError())  // catch launch-configuration errors
float hv = 0.0f;
// Blocking device-to-host copy also synchronizes with the kernel.
CUDA_CHECK_THROW(cudaMemcpy(&hv, r, sizeof(float), cudaMemcpyDeviceToHost));
std::cout << "EXPECTED " << sum << " ;ACTUAL " << hv << std::endl;
// Release device memory before exiting.
CUDA_CHECK_THROW(cudaFree(r))
CUDA_CHECK_THROW(cudaFree(val))
CUDA_CHECK_THROW(cudaFree(mask))
return 0;
}
The host generates a 32-item array with randomly initialized floating-point values. Only the threads in the warp whose id == 1
should add to the sum. However, the result from the device is wrong. An example output:
EXPECTED 16.1606 ;ACTUAL 2.51008
Your usage of __shfl_down_sync
is invalid. The programming guide states:
Threads may only read data from another thread which is actively participating in the __shfl_sync() command. If the target thread is inactive, the retrieved value is undefined.
However, you attempt to read data from inactive threads. The delta of __shfl_down_sync(mask, val, delta)
is a lane offset; it does not skip over lanes whose bits are unset in the mask. For example, with mask 0x55555555 (even lanes active), __shfl_down_sync(0x55555555, val, 1)
makes every active lane read from the odd-numbered lane directly above it — and all of those lanes are inactive, so the retrieved values are undefined.
A reduction for non-contiguous active threads is more complicated and can be found, for example, in the CUDA file cuda-12.1/include/cooperative_groups/details/coalesced_scan.h
, function inclusive_scan_non_contiguous
.
There is a very simple solution to your problem without requiring the match function or non-contiguous reduction. Just set the input value of the reduction to 0 for all threads with different selector ids and use all threads for reduction.
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <random>
// Stringification helpers used to build the FILE:LINE prefix below.
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
// Expands to a "file.cu:123" string literal at the point of use.
#define FILE_LINE __FILE__ ":" STR(__LINE__)
// Checks a CUDA runtime call and exits the process on error.
// NOTE(review): the trailing ';' after while(0) means call sites must NOT add
// their own semicolon — the code below consistently calls it without one.
#define CUDA_CHECK_THROW(x) \
do { \
cudaError_t result = x; \
if (result != cudaSuccess) { \
std::cout << FILE_LINE << " CUDA ERROR: " << cudaGetErrorString(result) << std::endl; \
exit(-1); \
} \
} while(0);
// BUG (kept for comparison with kernel2 below): `mask` from __match_any_sync
// can be non-contiguous, but __shfl_down_sync's delta is a lane offset — it
// does not skip lanes whose bits are unset in the mask. Active lanes read
// from inactive lanes, and per the programming guide those reads are undefined.
__global__ void foo(const float *vals, unsigned *ids, float *opt) {
unsigned laneId = threadIdx.x & 0x1f;  // lane index within the warp (0..31)
float val = vals[laneId];
// mask = bit set of lanes whose ids[laneId] equals this lane's id
unsigned mask = __match_any_sync(__activemask(), ids[laneId]);
if (mask & 0b1) {  // only the id-group that contains lane 0 proceeds
for (int i = 16; i > 0; i /= 2) {
val += __shfl_down_sync(mask, val, i);  // undefined whenever lane (laneId + i) is not in mask
}
if (laneId == 0) {
atomicAdd(opt, val);
}
}
}
// Correct variant: instead of reducing over a non-contiguous subset, lanes
// whose id differs from lane 0's id contribute 0, and the reduction runs over
// the entire warp — every lane participates, so the full mask is valid.
__global__
void kernel2(const float *vals, unsigned *ids, float *opt) {
unsigned laneId = threadIdx.x & 0x1f;  // lane index within the warp (0..31)
// Select this lane's value only when its id matches lane 0's id.
float val = (ids[laneId] == ids[0]) ? vals[laneId] : 0.0f;
// Standard full-warp butterfly-style shuffle-down reduction.
for (int offset = 16; offset != 0; offset >>= 1) {
val += __shfl_down_sync(0xFFFFFFFF, val, offset);
}
if (laneId == 0) {
atomicAdd(opt, val);
}
}
// Host driver: runs both the buggy foo and the corrected kernel2 on the same
// random input (lane 0 always gets id 1) and prints expected vs. actual sums.
int main() {
std::random_device rd;
std::mt19937 mt(rd());
std::uniform_real_distribution<float> dist(0.0, 1.5);
float *r, *val;
unsigned *mask;
CUDA_CHECK_THROW(cudaMalloc(&r, sizeof(float)))
CUDA_CHECK_THROW(cudaMalloc(&val, sizeof(float) * 32))
CUDA_CHECK_THROW(cudaMalloc(&mask, sizeof(unsigned) * 32))
CUDA_CHECK_THROW(cudaMemset(r, 0, sizeof(float)))
float h_val[32];
unsigned h_mask[32];
float sum = 0.0f;
for (int i = 31; i >= 0; --i) {
float v = dist(mt);
h_val[i] = v;
if (i == 0 || dist(mt) >= 0.5f) {  // lane 0 always belongs to the selected group
sum += v;
h_mask[i] = 1;
} else {
h_mask[i] = 0;
}
}
CUDA_CHECK_THROW(cudaMemcpy(val, h_val, 32 * sizeof(float), cudaMemcpyHostToDevice))
CUDA_CHECK_THROW(cudaMemcpy(mask, h_mask, 32 * sizeof(unsigned), cudaMemcpyHostToDevice))
foo<<<1, 32>>>(val, mask, r);
CUDA_CHECK_THROW(cudaGetLastError())  // catch launch-configuration errors
float hv = 0.0f;
// Blocking device-to-host copy also synchronizes with the kernel.
CUDA_CHECK_THROW(cudaMemcpy(&hv, r, sizeof(float), cudaMemcpyDeviceToHost));
std::cout << "EXPECTED " << sum << " ;ACTUAL " << hv << std::endl;
CUDA_CHECK_THROW(cudaMemset(r, 0, sizeof(float)))
kernel2<<<1, 32>>>(val, mask, r);
CUDA_CHECK_THROW(cudaGetLastError())  // catch launch-configuration errors
CUDA_CHECK_THROW(cudaMemcpy(&hv, r, sizeof(float), cudaMemcpyDeviceToHost));
std::cout << "EXPECTED " << sum << " ;ACTUAL " << hv << std::endl;
// Release device memory before exiting.
CUDA_CHECK_THROW(cudaFree(r))
CUDA_CHECK_THROW(cudaFree(val))
CUDA_CHECK_THROW(cudaFree(mask))
return 0;
}