Tags: cuda, thrust

CUDA 2nd order recursion with thrust inclusive_scan


I'm trying to understand how to parallelise a recursive calculation. Serially, the calculation takes the form:

for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }

For the i-1 index there's a solution here to a previous question of mine: CUDA force instruction execution order

I want to modify this to use the i-2 and I can't understand how to apply the same process to a 2nd order calculation. It should be possible using the thrust::inclusive_scan function, but I can't work out how. Does anyone know the solution?


Solution

  • Picking up where we left off in the previous question/answer, we shift our attention to equation 1.11 in the referenced paper by Blelloch. We observe that your problem formulation:

    for (int i = 2; i<size; i++)
      {
        result[i] = oldArray[i] + k * result[i-2];
      }
    

    seems to match that in equation 1.11 if we set m=2 and in that case we can also observe that for your formulation, all ai,1 are zero (and, as previously, all ai,2 are k).

    As per equation 1.12 in that paper, our state variable si now becomes a two-tuple:

    $s_i = \begin{bmatrix} x_i & x_{i-1} \end{bmatrix}$

    Taking note of these things, we observe the "correctness" of equation 1.13:

    $s_i = \begin{bmatrix} x_{i-1} & x_{i-2} \end{bmatrix} \cdot \begin{bmatrix} 0 & 1 \\ k & 0 \end{bmatrix} + \begin{bmatrix} b_i & 0 \end{bmatrix}$

    rewriting:

    $s_{i,1} = x_i = k \, x_{i-2} + b_i$

    $s_{i,2} = x_{i-1}$ (the second state component simply carries the previous value forward)

    (In my view, the other answer leaves you at this point. That realization, i.e. result.data[0] = right + k * left.data[1]; is sufficient for a serial scan but not for a parallel scan. It's also evident that the functor/scan op there is not associative.)

    We now need to come up with a binary operator bop that is an extension of the definition in (1.7) to this case. Referring to the previous definition in equation 1.7, we extend that based on the treatment in 1.13 as follows:

    $C_i = \left( A_i \,,\, B_i \right)$

    where:

    $A_i = \begin{bmatrix} 0 & 1 \\ k & 0 \end{bmatrix}$

    and:

    $B_i = \begin{bmatrix} b_i & 0 \end{bmatrix}$

    We then have:

    $C_i \;\mathrm{bop}\; C_j = \left( A_i \cdot A_j \,,\, B_i \cdot A_j + B_j \right)$

    This then becomes the formula for our functor/scan operator. We will need to carry 6 scalar "state" quantities throughout: 2 for the B vector and 4 for the A matrix.

    What follows then is a realization of the above:

    $ cat t1930.cu
    #include <iostream>
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    #include <thrust/scan.h>
    #include <thrust/copy.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/iterator/constant_iterator.h>
    #include <cstdlib>
    #include <cstdio>
    // Serial (CPU) reference for the second-order linear recurrence:
    //   result[i] = oldArray[i] + k * result[i-2],  for i in [2, size).
    // Precondition: the caller has already initialized result[0] and result[1].
    template <typename T>
    void cpufunction(T *result, T *oldArray, size_t size, T k){
      // size_t index: the original int counter mixed signed/unsigned in the
      // comparison with `size` and would overflow for sizes beyond INT_MAX.
      for (size_t i = 2; i<size; i++)
      {
        result[i] = oldArray[i] + k * result[i-2];
      }
    }
    struct scan_op // as per blelloch (1.7)
    {
      // Associative combine operator for the scan. Each operand is a 6-tuple
      // carrying the affine state (B, A) of equation (1.13):
      //   slots 0,1 : row vector B; on input B = (b_i, 0), and after the
      //               scan slot 0 holds x_i and slot 1 holds x_{i-1}
      //   slots 2..5: 2x2 matrix A stored row-major:
      //               slot2 = A(0,0), slot3 = A(0,1),
      //               slot4 = A(1,0), slot5 = A(1,1)
      // Implements C_i bop C_j = ( B_i*A_j + B_j , A_i*A_j ) with t1 = C_i,
      // t2 = C_j, which (unlike the naive per-element op) is associative and
      // therefore safe for a parallel scan.
      template <typename T1, typename T2>
      __host__ __device__
      T1 operator()(const T1 &t1, const T2 &t2){
        T1 ret;
        // new B = B(t1) * A(t2) + B(t2)   (row vector times matrix, plus vector)
        thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
        thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
        // new A = A(t1) * A(t2)           (2x2 matrix product)
        thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
        thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
        thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
        thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
        return ret;
        }
    };
    
    typedef float mt;     // scalar type; switch to double for tighter agreement with the CPU result
    const size_t ds = 512; // problem size
    const mt k = 1.01;     // recurrence coefficient
    const int snip = 10;   // number of leading/trailing elements printed
    int main(){
    
      mt *b1  = new mt[ds]; // b as in blelloch (1.5)
      mt *cr = new mt[ds]; // cpu result
      for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
      // seed the first two outputs; the recurrence starts at i = 2
      cr[0] = b1[0];
      cr[1] = b1[1];
      cpufunction(cr, b1, ds, k);
      // print head and tail of the CPU reference for comparison
      for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
      for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
      std::cout << std::endl;
      thrust::device_vector<mt> db(b1, b1+ds);
      // constant iterators: b0 is the zero second component of B = (b_i, 0);
      // a0..a3 are the fixed matrix A = |0 1, k 0| stored row-major
      // (a0 = A00, a1 = A01, a2 = A10, a3 = A11)
      auto b0 = thrust::constant_iterator<mt>(0);
      auto a0 = thrust::constant_iterator<mt>(0);
      auto a1 = thrust::constant_iterator<mt>(1);
      auto a2 = thrust::constant_iterator<mt>(k);
      auto a3 = thrust::constant_iterator<mt>(0);
      // six output streams for the scanned state: dx1 = x_i (the answer),
      // dx0 = x_{i-1}, dy0..dy3 = the accumulated A matrix entries
      thrust::device_vector<mt> dx1(ds);
      thrust::device_vector<mt> dx0(ds);
      thrust::device_vector<mt> dy0(ds);
      thrust::device_vector<mt> dy1(ds);
      thrust::device_vector<mt> dy2(ds);
      thrust::device_vector<mt> dy3(ds);
      // zip the six input/output streams into single tuple-valued iterators
      auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(db.begin(), b0, a0, a1, a2, a3));
      auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
      thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
      // copy the x_i stream back to the host and print head/tail
      thrust::host_vector<mt> hx1 = dx1;
      thrust::copy_n(hx1.begin(), snip, std::ostream_iterator<mt>(std::cout, ","));
      thrust::copy_n(hx1.begin()+ds-snip, snip, std::ostream_iterator<mt>(std::cout, ","));
      std::cout << std::endl;
    }
    $ nvcc -std=c++14 t1930.cu -o t1930
    $ cuda-memcheck ./t1930
    ========= CUDA-MEMCHECK
    0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.218,601.275,576.315,607.993,582.947,614.621,589.516,621.699,595.644,628.843,
    0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.621,589.516,621.7,595.644,628.843,
    ========= ERROR SUMMARY: 0 errors
    $
    

    Yes, there are some results above that differ in the 6th digit. I attribute this to the limitations of float resolution when taking into account the very different order of operations between the serial and parallel method. If you change the typedef to double, the results will appear to match exactly.

    Since you've asked about it here's an equivalent realization where it is demonstrated using device data previously allocated using cudaMalloc:

    $ cat t1930.cu
    #include <iostream>
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    #include <thrust/scan.h>
    #include <thrust/copy.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/iterator/constant_iterator.h>
    #include <cstdlib>
    #include <cstdio>
    // Serial (CPU) reference for the second-order linear recurrence:
    //   result[i] = oldArray[i] + k * result[i-2],  for i in [2, size).
    // Precondition: the caller has already initialized result[0] and result[1].
    template <typename T>
    void cpufunction(T *result, T *oldArray, size_t size, T k){
      // size_t index: the original int counter mixed signed/unsigned in the
      // comparison with `size` and would overflow for sizes beyond INT_MAX.
      for (size_t i = 2; i<size; i++)
      {
        result[i] = oldArray[i] + k * result[i-2];
      }
    }
    struct scan_op // as per blelloch (1.7)
    {
      // Associative combine operator for the scan. Each operand is a 6-tuple
      // carrying the affine state (B, A) of equation (1.13):
      //   slots 0,1 : row vector B; on input B = (b_i, 0), and after the
      //               scan slot 0 holds x_i and slot 1 holds x_{i-1}
      //   slots 2..5: 2x2 matrix A stored row-major:
      //               slot2 = A(0,0), slot3 = A(0,1),
      //               slot4 = A(1,0), slot5 = A(1,1)
      // Implements C_i bop C_j = ( B_i*A_j + B_j , A_i*A_j ) with t1 = C_i,
      // t2 = C_j.
      template <typename T1, typename T2>
      __host__ __device__
      T1 operator()(const T1 &t1, const T2 &t2){
        T1 ret;
        // new B = B(t1) * A(t2) + B(t2)   (row vector times matrix, plus vector)
        thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
        thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
        // new A = A(t1) * A(t2)           (2x2 matrix product)
        thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
        thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
        thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
        thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
        return ret;
        }
    };
    
    typedef double mt;     // double precision: serial and parallel results match to printed precision
    const size_t ds = 512; // problem size
    const mt k = 1.01;     // recurrence coefficient
    const int snip = 10;   // number of leading/trailing elements printed
    int main(){
    
      mt *b1  = new mt[ds]; // b as in blelloch (1.5)
      mt *cr = new mt[ds]; // cpu result
      for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
      // seed the first two outputs; the recurrence starts at i = 2
      cr[0] = b1[0];
      cr[1] = b1[1];
      cpufunction(cr, b1, ds, k);
      for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
      for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
      std::cout << std::endl;
      // device input allocated with plain cudaMalloc instead of device_vector
      // NOTE(review): CUDA API return codes are not checked (example code)
      mt *db;
      cudaMalloc(&db, ds*sizeof(db[0]));
      cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
      // wrap the raw device pointer so thrust algorithms treat it as device data
      thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
      // constant iterators: b0 is the zero second component of B = (b_i, 0);
      // a0..a3 are the fixed matrix A = |0 1, k 0| stored row-major
      auto b0 = thrust::constant_iterator<mt>(0);
      auto a0 = thrust::constant_iterator<mt>(0);
      auto a1 = thrust::constant_iterator<mt>(1);
      auto a2 = thrust::constant_iterator<mt>(k);
      auto a3 = thrust::constant_iterator<mt>(0);
      // six output streams for the scanned state: dx1 = x_i (the answer),
      // dx0 = x_{i-1}, dy0..dy3 = the accumulated A matrix entries
      thrust::device_vector<mt> dx1(ds);
      thrust::device_vector<mt> dx0(ds);
      thrust::device_vector<mt> dy0(ds);
      thrust::device_vector<mt> dy1(ds);
      thrust::device_vector<mt> dy2(ds);
      thrust::device_vector<mt> dy3(ds);
      auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
      auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
      thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
      // blocking copy of the x_i stream back to the host, then print head/tail
      cudaMemcpy(cr, thrust::raw_pointer_cast(dx1.data()), ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
      for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
      for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
      std::cout << std::endl;
    }
    $ nvcc -std=c++14 t1930.cu -o t1930
    $ cuda-memcheck ./t1930
    ========= CUDA-MEMCHECK
    0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
    0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
    ========= ERROR SUMMARY: 0 errors
    

    There should be no significant performance difference between these two approaches. (However I happened to switch the typedef to double for this example, so that makes a difference.) Using cudaMalloc as an alternative to the device_vector for the various state vectors (dx0, dx1, dy0, dy1 ...) may be slightly faster, because device_vector first does a cudaMalloc style allocation, then launches a kernel to zero out the allocation. This zero-ing step is unnecessary for the state vectors. The pattern given here should demonstrate how you could do that, if you are interested.

    Here's a version that eliminates use of thrust::device_vector and thrust::host_vector altogether:

    #include <iostream>
    #include <thrust/device_ptr.h>
    #include <thrust/scan.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/iterator/constant_iterator.h>
    #include <cstdlib>
    
    // Serial (CPU) reference for the second-order linear recurrence:
    //   result[i] = oldArray[i] + k * result[i-2],  for i in [2, size).
    // Precondition: the caller has already initialized result[0] and result[1].
    template <typename T>
    void cpufunction(T *result, T *oldArray, size_t size, T k){
      // size_t index: the original int counter mixed signed/unsigned in the
      // comparison with `size` and would overflow for sizes beyond INT_MAX.
      for (size_t i = 2; i<size; i++)
      {
        result[i] = oldArray[i] + k * result[i-2];
      }
    }
    struct scan_op // as per blelloch (1.7)
    {
      // Associative combine operator for the scan. Each operand is a 6-tuple
      // carrying the affine state (B, A) of equation (1.13):
      //   slots 0,1 : row vector B; on input B = (b_i, 0), and after the
      //               scan slot 0 holds x_i and slot 1 holds x_{i-1}
      //   slots 2..5: 2x2 matrix A stored row-major:
      //               slot2 = A(0,0), slot3 = A(0,1),
      //               slot4 = A(1,0), slot5 = A(1,1)
      // Implements C_i bop C_j = ( B_i*A_j + B_j , A_i*A_j ) with t1 = C_i,
      // t2 = C_j.
      template <typename T1, typename T2>
      __host__ __device__
      T1 operator()(const T1 &t1, const T2 &t2){
        T1 ret;
        // new B = B(t1) * A(t2) + B(t2)   (row vector times matrix, plus vector)
        thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
        thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
        // new A = A(t1) * A(t2)           (2x2 matrix product)
        thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
        thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
        thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
        thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
        return ret;
        }
    };
    
    typedef float mt;          // scalar type
    const size_t ds = 32768*4; // problem size
    const mt k = 1.001;        // recurrence coefficient
    const int snip = 10;       // number of leading/trailing elements printed
    int main(){
    
      mt *b1  = new mt[ds]; // b as in blelloch (1.5)
      mt *cr = new mt[ds]; // result
      // inputs centered on zero: rand() scaled into [-0.5, 0.5)
      for (int i = 0; i < ds; i++) { b1[i] = (rand()/(float)RAND_MAX)-0.5;}
      // seed the first two outputs; the recurrence starts at i = 2
      cr[0] = b1[0];
      cr[1] = b1[1];
      cpufunction(cr, b1, ds, k);
      for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
      for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
      std::cout << std::endl;
      // raw device allocations: db holds the input, dstate holds all six
      // scan-state streams in a single 6*ds buffer (no device_vector, so no
      // unnecessary zero-initialization kernel)
      // NOTE(review): CUDA API return codes are not checked (example code)
      mt *db, *dstate;
      cudaMalloc(&db, ds*sizeof(db[0]));
      cudaMalloc(&dstate, 6*ds*sizeof(dstate[0]));
      cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
      // wrap the raw device pointer so thrust algorithms treat it as device data
      thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
      // constant iterators: b0 is the zero second component of B = (b_i, 0);
      // a0..a3 are the fixed matrix A = |0 1, k 0| stored row-major
      auto b0 = thrust::constant_iterator<mt>(0);
      auto a0 = thrust::constant_iterator<mt>(0);
      auto a1 = thrust::constant_iterator<mt>(1);
      auto a2 = thrust::constant_iterator<mt>(k);
      auto a3 = thrust::constant_iterator<mt>(0);
      // partition dstate into six ds-length segments; dx1 (segment 0) receives
      // x_i, dx0 receives x_{i-1}, dy0..dy3 the accumulated A matrix entries
      thrust::device_ptr<mt> dx1 = thrust::device_pointer_cast(dstate);
      thrust::device_ptr<mt> dx0 = thrust::device_pointer_cast(dstate+ds);
      thrust::device_ptr<mt> dy0 = thrust::device_pointer_cast(dstate+2*ds);
      thrust::device_ptr<mt> dy1 = thrust::device_pointer_cast(dstate+3*ds);
      thrust::device_ptr<mt> dy2 = thrust::device_pointer_cast(dstate+4*ds);
      thrust::device_ptr<mt> dy3 = thrust::device_pointer_cast(dstate+5*ds);
      auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
      auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1, dx0, dy0, dy1, dy2, dy3));
      thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
      // the first ds elements of dstate are the x_i stream (dx1); blocking copy
      cudaMemcpy(cr, dstate, ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
      for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
      for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
      std::cout << std::endl;
    }