Say you have a cuArray for binding to a surface object, something of the form:
// These are inputs to a function really.
cudaArray* d_cuArrSurf;
cudaSurfaceObject_t * surfImage;
const cudaExtent extent = make_cudaExtent(width, height, depth);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_cuArrSurf, &channelDesc, extent);
// Bind to Surface
cudaResourceDesc surfRes;
memset(&surfRes, 0, sizeof(cudaResourceDesc));
surfRes.resType = cudaResourceTypeArray;
surfRes.res.array.array = d_cuArrSurf;
cudaCreateSurfaceObject(surfImage, &surfRes);
Now, I want to initialize this cuArray to zero. Apparently there is no memset for cuArray-type objects. What would be the best way to do this? Maybe multiple options are possible, and some may have better or worse features. What are these options?
I can think of two:

1. allocate and zero host memory, then copy it to the array using cudaMemcpy3D()
2. create an initialization kernel that writes zeros with surf3Dwrite() (a rough sketch follows below)
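For the second option, I imagine something roughly like this (an untested sketch; the kernel name is a placeholder, and it assumes the launch grid exactly covers the array extent):

__global__ void zeroInitKernel(cudaSurfaceObject_t surf)
{
    // one thread per element; assumes gridDim*blockDim matches the array extent exactly
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned int z = blockIdx.z*blockDim.z + threadIdx.z;
    // surf3Dwrite takes the x coordinate in bytes
    surf3Dwrite(0.0f, surf, x*sizeof(float), y, z, cudaBoundaryModeTrap);
}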
Would it be possible for you to show an example of these approaches?
Here is a rough example, extending the previous rough code:
$ cat t1648.cu
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
__device__ float my_common(float *d, int width, unsigned int x, unsigned int y){
// 200 lines of common code...
return d[y *width +x];
}
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Write to a cuArray using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void WriteKernel(float *gIData, int width, int height,
cudaSurfaceObject_t outputSurface)
{
// calculate surface coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z*blockDim.z + threadIdx.z;
// read from global memory and write to the cuArray (via the surface object)
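// note: surf3Dwrite takes a byte-based x coordinate, hence x*4 for a float surface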
surf3Dwrite(my_common(gIData, width, x, y),
outputSurface, x*4, y, z, cudaBoundaryModeTrap);
}
__global__ void WriteKernel(float *gIData, int width, int height,
float *out)
{
// calculate coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
// read from global memory and write to global memory
out[y*width+x] = my_common(gIData, width, x, y);
}
__global__ void ReadKernel(float tval, cudaSurfaceObject_t outputSurface)
{
// calculate surface coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z*blockDim.z + threadIdx.z;
// read from the cuArray (via the surface object) and check the value
float val;
surf3Dread(&val,
outputSurface, x*4, y, z, cudaBoundaryModeTrap);
if (val != tval) printf("oops\n");
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("starting...\n");
unsigned width = 256;
unsigned height = 256;
unsigned depth = 256;
unsigned int size = depth*width * height * sizeof(float);
// Allocate device memory for result
float *dData = NULL;
cudaMalloc((void **) &dData, size);
// Allocate array and copy image data
float *out, *h_out;
h_out = new float[height*width*depth];
float tval = 1.0f;
for (int i = 0; i < height*width*depth; i++) h_out[i] = tval;
cudaArray* d_cuArrSurf;
cudaSurfaceObject_t surfImage;
const cudaExtent extent = make_cudaExtent(width, height, depth);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_cuArrSurf, &channelDesc, extent);
// Bind to Surface
cudaResourceDesc surfRes;
memset(&surfRes, 0, sizeof(cudaResourceDesc));
surfRes.resType = cudaResourceTypeArray;
surfRes.res.array.array = d_cuArrSurf;
cudaCreateSurfaceObject(&surfImage, &surfRes);
cudaMalloc(&out, size);
cudaMemcpy(out, h_out, size, cudaMemcpyHostToDevice);
dim3 dimBlock(8, 8, 8);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
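// note: with a grid z-dimension of 1, the kernels that use z only touch slices 0..blockDim.z-1 (0..7 here)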
// initialize array
cudaMemcpy3DParms p = {0};
p.srcPtr = make_cudaPitchedPtr(out, width*sizeof(out[0]), width, height);
p.srcPos = make_cudaPos(0,0,0);
p.dstArray = d_cuArrSurf;
p.dstPos = make_cudaPos(0,0,0);
p.extent = make_cudaExtent(width, height, 1);
p.kind = cudaMemcpyDefault;
for (int i = 0; i < depth; i++){
cudaMemcpy3D(&p);
p.dstPos = make_cudaPos(0,0, i+1);}
ReadKernel<<<dimGrid, dimBlock>>>(tval, surfImage);
WriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, surfImage);
WriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, out);
cudaDeviceSynchronize();
}
$ nvcc -o t1648 t1648.cu
$ cuda-memcheck ./t1648
========= CUDA-MEMCHECK
starting...
========= ERROR SUMMARY: 0 errors
$
The (total) extent above is 256x256x256, so I chose a per-transfer extent of 256x256x1 (basically one z-slice) over 256 iterations of cudaMemcpy3D. It seems to pass the sniff test.
I used 1 as my initializing value for device memory here "just because". If you wanted to make this faster and initialize to zero, skip the host->device copy and just use cudaMemset to initialize the linear memory (source for 3D transfer) to zero.
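For reference, a minimal sketch of that zero-initialization variant, reusing the out, size, and per-slice copy loop from the example above (so tval would effectively be 0.0f):

// no host buffer or host->device copy needed for a zero fill
cudaMalloc(&out, size);
cudaMemset(out, 0, size);   // zero the linear source buffer directly on the device
// ...then run the same per-slice cudaMemcpy3D loop as above to fill d_cuArrSurf with zeros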