Tags: cuda, thrust, stencil-buffer, gpu-shared-memory

How to implement a Thrust version of a 1D stencil code?


Basically, is it possible to implement the 1D stencil kernel shown below using pure Thrust? I want the implementation to be as efficient as possible, which implies that Thrust should somehow recognize that the same elements are accessed multiple times and use shared memory accordingly.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#define BLOCK_SIZE 128
#define RADIUS 8
#define SIZE 1024*1024*8
const dim3 DimBlock(BLOCK_SIZE);
const dim3 DimGrid(SIZE/BLOCK_SIZE);

__global__ void stencil_1d(const int * in, int *out) {
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read input elements into shared memory
    if( gindex < SIZE ) temp[lindex] = in[gindex]; else temp[lindex] = 0;
    if (threadIdx.x < RADIUS) {
        if (gindex - RADIUS >= 0) temp[lindex - RADIUS] = in[gindex - RADIUS]; else temp[lindex - RADIUS] = 0;
        if (gindex + BLOCK_SIZE < SIZE) temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE]; else temp[lindex + BLOCK_SIZE] = 0;
    }

    // Synchronize (ensure all the data is available)
    __syncthreads();

    // Apply the stencil
    int result = 0;
    if (gindex < SIZE)
        for (int offset = -RADIUS; offset <= RADIUS; offset++)
            result += temp[lindex + offset];

    // Store the result
    if( gindex < SIZE ) out[gindex] = result;
}

int main()
{
    cudaError_t cudaStat;
    thrust::device_vector<int> dev_vec_inp(SIZE,1);
    thrust::device_vector<int> dev_vec_out(SIZE);
    try 
    {
        stencil_1d<<< DimGrid, DimBlock >>>(thrust::raw_pointer_cast(dev_vec_inp.data()) , thrust::raw_pointer_cast(dev_vec_out.data()));
        cudaStat = cudaGetLastError();
        if (cudaStat != cudaSuccess)
            throw cudaGetErrorString(cudaStat);
        else
            std::cout << "1D stencil has been executed successfully" << std::endl;
    }   
    catch(const char* e)
    {
        std::cout<< e;
    }
}

Solution

  • For this particular type of stencil operation (+) you could use a prefix sum method. You would perform a prefix sum first, then, for each output element, subtract the prefix-sum value at the left end of the stencil window from the value at the right end of the window.

    Here is a rough sketch:

    $ cat t1777.cu
    #include <thrust/device_vector.h>
    #include <thrust/scan.h>
    #include <thrust/copy.h>
    #include <thrust/transform.h>
    #include <thrust/host_vector.h>
    #include <iostream>
    
    const int ds = 20;
    const int stencil_radius = 7;
    using namespace thrust::placeholders;
    typedef int mt;
    
    int main(){
    
      thrust::device_vector<mt> data(ds, 1);
      thrust::device_vector<mt> result(ds-(2*stencil_radius));
      thrust::inclusive_scan(data.begin(), data.end(), data.begin());
      thrust::transform(data.begin(), data.end()-(2*stencil_radius),data.begin()+(2*stencil_radius), result.begin(), _2-_1);
      thrust::host_vector<mt> h_result = result;
      thrust::copy(h_result.begin(), h_result.end(), std::ostream_iterator<mt>(std::cout, ","));
      std::cout << std::endl;
    }
    $ nvcc -o t1777 t1777.cu
    $ ./t1777
    14,14,14,14,14,14,
    $
    

    If you run the above code under a profiler, you can determine that one of the kernels Thrust launches for the prefix-sum operation does use shared memory.

    I really haven't tried to create a proper stencil window consisting of a radius on the left, a radius on the right, and the stencil position in the center, but that should only require a slight modification of what I have shown above (see the sketch at the end of this answer).

    I'm not making any claims that this is better or faster than the "pure CUDA" method, but it seems to be one way to perform it using a "pure thrust" method, with thrust making use of shared memory.

    I'm not suggesting this code is defect-free or suitable for any particular purpose. It is provided merely to demonstrate an idea. Use it at your own risk.
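    As one possible variant, here is a minimal sketch (under the same assumptions and with the same caveats as the code above) of what that modification could look like. The only change is that the prefix-sum array gets a leading zero, so each output element can cover the full 2*radius+1 window centered on its position; the names `window` and `sums` are just illustrative.

    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    #include <thrust/scan.h>
    #include <thrust/transform.h>
    #include <thrust/copy.h>
    #include <iostream>
    #include <iterator>

    const int ds = 20;
    const int stencil_radius = 7;
    using namespace thrust::placeholders;
    typedef int mt;

    int main(){

      const int window = 2*stencil_radius + 1;   // full stencil width
      thrust::device_vector<mt> data(ds, 1);
      // prefix sums with a leading zero: sums[k] holds the sum of data[0..k-1]
      thrust::device_vector<mt> sums(ds + 1, 0);
      thrust::inclusive_scan(data.begin(), data.end(), sums.begin() + 1);
      // result[i] = sums[i + window] - sums[i] = sum of data[i .. i + 2*stencil_radius]
      thrust::device_vector<mt> result(ds - 2*stencil_radius);
      thrust::transform(sums.begin() + window, sums.end(), sums.begin(), result.begin(), _1 - _2);
      thrust::host_vector<mt> h_result = result;
      // for all-ones input, each output element should equal window (i.e. 15 here)
      thrust::copy(h_result.begin(), h_result.end(), std::ostream_iterator<mt>(std::cout, ","));
      std::cout << std::endl;
    }

    This keeps the same prefix-sum idea; the extra leading zero simply lets the window at each output position extend radius elements to both sides without falling off the front of the scan.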