Say you have a cuArray for binding to a surface object, something of the form:
// These are inputs to a function really.
cudaArray* d_cuArrSurf;
cudaSurfaceObject_t * surfImage;
const cudaExtent extent = make_cudaExtent(width, height, depth);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_cuArrSurf, &channelDesc, extent);
// Bind to Surface
cudaResourceDesc surfRes;
memset(&surfRes, 0, sizeof(cudaResourceDesc));
surfRes.resType = cudaResourceTypeArray;
surfRes.res.array.array = d_cuArrSurf;
cudaCreateSurfaceObject(surfImage, &surfRes);
Now, I want to initialize this cuArray to zero. Apparently there is no memset for cuArray-type objects. What would be the best way to do this? Maybe multiple options are possible, and some may have better or worse features. What are these options?
I can think of two:

1. allocate and zero host memory, then copy it to the array using cudaMemcpy3D()
2. create an initialization kernel that writes zeros with surf3Dwrite() (a rough sketch follows below)
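For the second option, I imagine something roughly like this (an untested sketch; the kernel name is a placeholder, and it assumes the launch grid exactly covers the array extent):

__global__ void zeroInitKernel(cudaSurfaceObject_t surf)
{
    // one thread per element; assumes gridDim*blockDim matches the array extent exactly
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned int z = blockIdx.z*blockDim.z + threadIdx.z;
    // surf3Dwrite takes the x coordinate in bytes
    surf3Dwrite(0.0f, surf, x*sizeof(float), y, z, cudaBoundaryModeTrap);
}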
Would it be possible for you to show an example of these approaches?
Here is a rough example, extending the previous rough code:
$ cat t1648.cu
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
__device__ float my_common(float *d, int width, unsigned int x, unsigned int y){
// 200 lines of common code...
return d[y *width +x];
}
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Write to a cuArray using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void WriteKernel(float *gIData, int width, int height,
cudaSurfaceObject_t outputSurface)
{
// calculate surface coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z*blockDim.z + threadIdx.z;
// read from global memory and write to the cuArray (via the surface object)
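// note: surf3Dwrite takes a byte-based x coordinate, hence x*4 for a float surface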
surf3Dwrite(my_common(gIData, width, x, y),
outputSurface, x*4, y, z, cudaBoundaryModeTrap);
}
__global__ void WriteKernel(float *gIData, int width, int height,
float *out)
{
// calculate coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
// read from global memory and write to global memory
out[y*width+x] = my_common(gIData, width, x, y);
}
__global__ void ReadKernel(float tval, cudaSurfaceObject_t outputSurface)
{
// calculate surface coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z*blockDim.z + threadIdx.z;
// read from the cuArray (via the surface object) and check the value
float val;
surf3Dread(&val,
outputSurface, x*4, y, z, cudaBoundaryModeTrap);
if (val != tval) printf("oops\n");
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("starting...\n");
unsigned width = 256;
unsigned height = 256;
unsigned depth = 256;
unsigned int size = depth*width * height * sizeof(float);
// Allocate device memory for result
float *dData = NULL;
cudaMalloc((void **) &dData, size);
// Allocate array and copy image data
float *out, *h_out;
h_out = new float[height*width*depth];
float tval = 1.0f;
for (int i = 0; i < height*width*depth; i++) h_out[i] = tval;
cudaArray* d_cuArrSurf;
cudaSurfaceObject_t surfImage;
const cudaExtent extent = make_cudaExtent(width, height, depth);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_cuArrSurf, &channelDesc, extent);
// Bind to Surface
cudaResourceDesc surfRes;
memset(&surfRes, 0, sizeof(cudaResourceDesc));
surfRes.resType = cudaResourceTypeArray;
surfRes.res.array.array = d_cuArrSurf;
cudaCreateSurfaceObject(&surfImage, &surfRes);
cudaMalloc(&out, size);
cudaMemcpy(out, h_out, size, cudaMemcpyHostToDevice);
dim3 dimBlock(8, 8, 8);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
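// note: with a grid z-dimension of 1, the kernels that use z only touch slices 0..blockDim.z-1 (0..7 here)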
// initialize array
cudaMemcpy3DParms p = {0};
p.srcPtr = make_cudaPitchedPtr(out, width*sizeof(out[0]), width, height);
p.srcPos = make_cudaPos(0,0,0);
p.dstArray = d_cuArrSurf;
p.dstPos = make_cudaPos(0,0,0);
p.extent = make_cudaExtent(width, height, 1);
p.kind = cudaMemcpyDefault;
for (int i = 0; i < depth; i++){
cudaMemcpy3D(&p);
p.dstPos = make_cudaPos(0,0, i+1);}
ReadKernel<<<dimGrid, dimBlock>>>(tval, surfImage);
WriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, surfImage);
WriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, out);
cudaDeviceSynchronize();
}
$ nvcc -o t1648 t1648.cu
$ cuda-memcheck ./t1648
========= CUDA-MEMCHECK
starting...
========= ERROR SUMMARY: 0 errors
$
The (total) extent above is 256x256x256, so I chose a per-transfer extent of 256x256x1 (basically one z-slice) over 256 iterations of cudaMemcpy3D. It seems to pass the sniff test.
I used 1 as my initializing value for device memory here "just because". If you wanted to make this faster and initialize to zero, skip the host->device copy and just use cudaMemset to initialize the linear memory (source for 3D transfer) to zero.
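For reference, a minimal sketch of that zero-initialization variant, reusing the out, size, and per-slice copy loop from the example above (so tval would effectively be 0.0f):

// no host buffer or host->device copy needed for a zero fill
cudaMalloc(&out, size);
cudaMemset(out, 0, size);   // zero the linear source buffer directly on the device
// ...then run the same per-slice cudaMemcpy3D loop as above to fill d_cuArrSurf with zeros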