c++ · asynchronous · cuda · gpu-shared-memory

memcpy_async does not work with pipeline roles


If I do a memcpy_async on a per thread basis, everything works fine, see the test_memcpy32 below. This code prefetches data within a single warp.

I want to expand this, so that I can prefetch data in warp 0 (of a block) and use it (later) in warp 1 of that same block. For this I need to use a block pipeline.
However, if I do a memcpy_async on a thread_block granularity, then I cannot get it to work, see test_memcpy.
I based my code on the CUDA pipeline documentation.

Here's my MCVE:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cooperative_groups.h>
#include <cuda/pipeline>
#include <cassert>
#include <stdio.h>

static constexpr auto consumer = cuda::pipeline_role::consumer;
static constexpr auto producer = cuda::pipeline_role::producer;

// Per-thread pipeline demo: every thread prefetches (at most) its own element
// of gdata into shared memory via cuda::memcpy_async, then prints it.
// Expects a launch of exactly 32 threads (one warp). Requires SM80+ for
// hardware-accelerated cuda::memcpy_async.
__global__ void test_memcpy32(int gdata[32]) {
    assert(blockDim.x == 32);
    __shared__ int sdata[32];
    auto pipeline_warp = cuda::make_pipeline(); //per thread
    pipeline_warp.producer_acquire(); //noop for per thread
    // Odd lanes copy one int, even lanes copy nothing (size 0) — shows the
    // per-thread memcpy_async tolerates divergent sizes across the warp.
    // NOTE(review): even lanes never write sdata[threadIdx.x], so their
    // printed sdata value is whatever the uninitialized shared slot holds.
    const auto size = threadIdx.x & 1 ? sizeof(int): 0;
    cuda::memcpy_async(&sdata[threadIdx.x], &gdata[threadIdx.x], size, pipeline_warp);
    pipeline_warp.producer_commit();
    //do other stuff
    pipeline_warp.consumer_wait();
    printf("tid: %i: gdata[%i] = %i, sdata[%i] = %i\n", threadIdx.x, threadIdx.x, gdata[threadIdx.x], threadIdx.x, sdata[threadIdx.x]);
    pipeline_warp.consumer_release(); //noop for per thread

}

// Block-scope pipeline demo: warp 1 is the producer (prefetches gdata into
// shared memory), warp 0 is the consumer (prints). Expects a launch of
// exactly 64 threads (two warps). Requires SM80+.
// NOTE(review): this is the broken version from the question — see the
// comment on the memcpy_async call below.
__global__ void test_memcpy(int gdata[32]) {
    assert(blockDim.x == 64);
    const auto group = cooperative_groups::this_thread_block();
    // One-stage pipeline state shared by both warps of the block.
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, 1> PipelineState;
    const auto warpid = threadIdx.x >= warpSize;
    const auto laneid = threadIdx.x & 31;
    //warp 0 = consumer, warp 1 = producer
    const auto role = warpid ? producer : consumer;
    auto pipeline_test = cuda::make_pipeline(group, &PipelineState, role);
    __shared__ int sdata[32];
    sdata[laneid] = 0;
    // Both branches yield 4 bytes — presumably a leftover from the
    // divergent-size experiment in test_memcpy32.
    const size_t size = threadIdx.x & 1 ? 4 : 4;
    __syncthreads();
    if (role == producer) {
        pipeline_test.producer_acquire();
        // BUG (per the accepted answer): passing `group` selects the
        // group-collective overload of memcpy_async, which expects uniform
        // arguments across all threads of the group and does not fit a
        // role-partitioned pipeline — drop the `group` argument.
        cuda::memcpy_async(group, &sdata[laneid], &gdata[laneid], size, pipeline_test);
        pipeline_test.producer_commit();
    } else {
        assert(role == consumer);
        pipeline_test.consumer_wait();
        printf("tid: %i: gdata[%i] = %i, sdata[%i] = %i\n", threadIdx.x, laneid, gdata[laneid], laneid, sdata[laneid]);
        pipeline_test.consumer_release();
    }
}

// Host driver: fills a 32-int device buffer with 0..31 and launches both
// kernels (block-pipeline version with 64 threads, per-thread version with 32).
// NOTE(review): no CUDA error checking — launch/runtime failures are silent.
int main() {
    int* gdata;
    int buffer[32];
    cudaMalloc(&gdata, sizeof(buffer));
    // C++20 init-statement in a range-for: buffer becomes {0, 1, ..., 31}.
    for (auto i = 0; auto& d: buffer) { d = i++; }
    cudaMemcpy(gdata, buffer, sizeof(buffer), cudaMemcpyHostToDevice);
    test_memcpy<<<1, 64>>>(gdata);  //does not work
    test_memcpy32<<<1, 32>>>(gdata); //works just fine.
    cudaDeviceSynchronize();
}

The per thread code: test_memcpy32 outputs the correct data, but the per block code does not work. It outputs all zeros, even though I followed the CUDA documentation.

I'm using CUDA 13 and Visual Studio 17.9.6 on an RTX 3070 (compute capability 8.6). Updating to the latest VS makes no difference. Note that this code needs Ampere (sm_80) or above to run, so it does not work on godbolt, because that uses an sm_75 T4 GPU.

If I run compute-sanitizer with the synccheck tool I get:

compute-sanitizer --tool=synccheck .\x64\Debug\MCVE_memcpy_async.exe
========= COMPUTE-SANITIZER
========= Barrier error detected. Missing wait.
=========     at unsigned long long cuda::ptx::__4::mbarrier_arrive<void>(unsigned long long *)+0x1b0 in mbarrier_arrive.h:21
=========     by thread (32,0,0) in block (0,0,0)
//Repeated for all threads in warp 1
=========     Barrier is located at shared address 0x0
=========         Device Frame: cuda::__4::barrier<(cuda::std::__4::thread_scope)2, cuda::std::__4::__empty_completion>::arrive(long long)+0x9a0 in barrier_block_scope.h:130
=========         Device Frame: cuda::__4::pipeline<(cuda::std::__4::thread_scope)2>::producer_commit()+0x6e0 in pipeline:270
=========         Device Frame: test_memcpy(int *)+0x1470 in kernel.cu:57
=========     Saved host backtrace up to driver entry point at kernel launch time
=========         Host Frame: main in kernel.cu:72 [0x7a59] in MCVE_memcpy_async.exe

What am I doing wrong?


Solution

  • As stated in the question, the simpler thread-based memcpy_async works fine.

    The error in the block-based code test_memcpy is in the line: cuda::memcpy_async(group, &sdata[laneid], &gdata[laneid], size, pipeline_test);
    The pipeline is already a group'ed pipeline; putting the group as the first parameter links the call to the wrong overload. If you remove that parameter, it links to the correct overload and everything works.
    The 'group' overload expects all threads calling memcpy_async to have the same parameters (and then somehow that implementation will assign different parts of the prefetched data block to different threads).

    The following code works:

    // Fixed block-scope pipeline kernel: warp 1 produces (prefetches gdata
    // into shared memory), warp 0 consumes (prints). Expects exactly 64
    // threads. The only change from the question's version is that the
    // memcpy_async call no longer passes `group`, so it binds to the
    // per-thread overload that cooperates correctly with the role-split
    // block pipeline. Requires SM80+.
    __global__ void test_memcpy(int gdata[32]) {
        assert(blockDim.x == 64);
        const auto group = cooperative_groups::this_thread_block();
        // One-stage pipeline state shared by both warps of the block.
        __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, 1> PipelineState;
        const auto warpid = threadIdx.x >= warpSize;
        const auto laneid = threadIdx.x & 31;
        //warp 0 = consumer, warp 1 = producer
        const auto role = warpid ? producer : consumer;
        auto pipeline_test = cuda::make_pipeline(group, &PipelineState, role);
        __shared__ int sdata[32];
        sdata[laneid] = 0;
        const size_t size = threadIdx.x & 1 ? 4 : 4;
        __syncthreads();
        if (role == producer) {
            pipeline_test.producer_acquire();
            //**************************************
            //do *not* use group in the memcpy_async
            cuda::memcpy_async(/*group,*/ &sdata[laneid], &gdata[laneid], size, pipeline_test);
            //*************************************
            pipeline_test.producer_commit();
        } else {
            assert(role == consumer);
            // consumer_wait blocks until the producer warp has committed the
            // stage, so the prefetched sdata values are visible here.
            pipeline_test.consumer_wait();
            printf("tid: %i: gdata[%i] = %i, sdata[%i] = %i\n", threadIdx.x, laneid, gdata[laneid], laneid, sdata[laneid]);
            pipeline_test.consumer_release();
        }
    }
    
    // Host driver for the fixed kernel: fills a 32-int device buffer with
    // 0..31 and launches the block-pipeline kernel with 64 threads.
    // NOTE(review): still no CUDA error checking.
    int main() {
        int* gdata;
        int buffer[32];
        cudaMalloc(&gdata, sizeof(buffer));
        // C++20 init-statement in a range-for: buffer becomes {0, 1, ..., 31}.
        for (auto i = 0; auto& d: buffer) { d = i++; }
        cudaMemcpy(gdata, buffer, sizeof(buffer), cudaMemcpyHostToDevice);
        test_memcpy<<<1, 64>>>(gdata);  //now it works
        //test_memcpy32<<<1, 32>>>(gdata); //works just fine.
        cudaDeviceSynchronize();
    }