cudadynamic-parallelism

CUDA device runtime api cudaMemsetAsync doesn't work


I am trying to call cudaMemsetAsync from kernel (so called "dynamic parallelism"). But no matter what value I use, it always set memory to 0.

Here is my test code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include <stdio.h>

const int size = 5;

__global__ void kernel(int *c)
{
    cudaMemsetAsync(c, 0x7FFFFFFF, size * 4, NULL);
}

int main()
{
    cudaError_t cudaStatus;
    int c[size] = { 12, 12, 12, 12, 12 };
    int *dev_c = 0;

    cudaStatus = cudaSetDevice(0);
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
    kernel <<< 1, 1 >>>(dev_c);
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_c);
    cudaStatus = cudaDeviceReset();

    printf("%d\n", cudaStatus);
    printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}

And if I run it, I got output like this:

>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt
kernel.cu
   Creating library a.lib and object a.exp
0
{0,0,0,0,0}

When I call memory set, I use value 0x7FFFFFFF. I'm expecting non-zero numbers, but it always shows zero.

Is this a bug? or I did something wrong? I'm using CUDA 8.0


Solution

  • I can confirm this appears not to work in CUDA 8 on the systems I tested it with.

    If you want a single thread to perform the operation, you can use memset directly in device code (it, like memcpy, has been supported forever). The kernel will emit a byte sized loop inline within your kernel and the operation will be handled by each running thread.

    If you want a dynamic parallelism style memset operation, then the easiest thing is to make your own. A trivial (and very, very lightly tested) implementation in the code you posted might look like this:

    #include <cstring>
    #include <cstdio>
    
    const int size = 5;
    
    __global__ void myMemset_kernel(void* p, unsigned char val, size_t sz)
    {
        size_t tid = threadIdx.x + blockDim.x * blockIdx.x;
        unsigned char* _p = (unsigned char*)p;
        for(; tid < sz; tid += blockDim.x * gridDim.x) {
           _p[tid] = val;
        }
    }
    
    __device__ void myMemset(void* p, unsigned int val, size_t sz, cudaStream_t s=NULL)
    {
        const dim3 blocksz(256,1,1); 
        size_t nblocks = (sz + blocksz.x -1) / blocksz.x;
    
        unsigned charval = val & 0xff;
        myMemset_kernel<<< dim3(nblocks,1,1), blocksz, 0, s >>>(p, charval, sz); 
    }
    
    __global__ void kernel(int *c)
    {
        cudaStream_t s;
        cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
        myMemset(c, 0x7FFFFFFF, size * 4, s);
        cudaDeviceSynchronize();
    }
    
    int main()
    {
        int c[size];
        int *dev_c;
    
        memset(&c[0], 0xffffff0c, size * sizeof(int));
        printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
    
        cudaMalloc((void**)&dev_c, size * sizeof(int));
        cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
        kernel <<< 1, 1 >>>(dev_c);
        cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
        cudaFree(dev_c);
    
        printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
        return 0;
    }
    

    which compiles and does this:

    $ nvcc -rdc=true -arch=sm_52 -o memset memset.cu -lcudadevrt
    $ ./memset 
    {0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c}
    {ffffffff,ffffffff,ffffffff,ffffffff,ffffffff}
    

    A final point -- note the values above and read this question and answer. In your code, it is not possible to use cudaMemset to apply a value of 0x7FFFFFFF. Although the value argument is an unsigned integer, cudaMemset and its relatives work like regular memset and set byte values. Only the least significant byte of the 32 bit argument is used to set values. If your objective is to set 32 bit values, then you will need to make your own version of memset for that purpose anyway.