Tags: arrays, gpu, openacc, pgi, pgi-accelerator

Join array results in OpenACC


I'm writing OpenACC code that has an array dependence: each iteration of the inner loop can update a position of the array that other iterations of the outer loop also update. Here's some code:

    /* One accumulator per decimal position, all starting at zero. */
    long unsigned int digits[d + 11];
    for (long unsigned int k = 0; k < d + 11; ++k) {
            digits[k] = 0;
    }

    /* For every divisor i, long-divide 1 by i and add each quotient digit
       into the accumulator for its position. The walk over positions ends
       early as soon as the remainder becomes zero. */
    for (long unsigned int i = 1; i <= n; ++i) {
            long unsigned int remainder = 1;
            long unsigned int digit = 0;
            while (digit < d + 11 && remainder) {
                    digits[digit] += remainder / i; // here
                    remainder = (remainder % i) * 10;
                    ++digit;
            }
    }

The OpenMP version was written as follows:

    /* Per-thread accumulation followed by a serialized merge into digits.
       i is declared outside this fragment and made private to each thread. */
    #pragma omp parallel private(i)
    {
            /* Declared inside the parallel region, so every thread has its
               own copy; it is zeroed before any accumulation. */
            long unsigned int digit_local[d+11];
            for(i=0;i<d+11;i++)
                    digit_local[i] = 0;

            /* The iterations over i are divided among the threads; each
               thread adds only into its own digit_local. */
            #pragma omp for
            for (i = 1; i <= n; ++i) {
                    long unsigned int remainder = 1;
                    /* Long division of 1 by i, one quotient digit per
                       position; stops early once the remainder is zero. */
                    for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
                            long unsigned int div = remainder / i;
                            long unsigned int mod = remainder % i;
                            digit_local[digit] += div;
                            remainder = mod * 10;
                    }
            }

            /* The critical directive applies to the whole merge loop, so only
               one thread at a time adds its partial sums into digits. */
            #pragma omp critical
            for(long unsigned int digit = 0; digit < d+11; ++digit)
                    digits[digit] += digit_local[digit];

    }

In OpenACC, the private keyword works with arrays, but I have no idea how to join the private arrays into the global array.

Thanks.


Solution

  • You would use an OpenACC "atomic update" directive.

                /* Perform the += on the shared element as a single atomic
                   update so concurrent iterations cannot lose increments. */
                #pragma acc atomic update
                digits[digit] += div; // here
    

    Alternatively, you could do something similar to your OpenMP version.

        /* Scratch array with one column per value of i, so that no two
           iterations of the first loop ever write the same element.
           Iteration i (1..n) owns column i-1; using i itself as the column
           index would run one past the end of the n-wide second dimension.
           NOTE(review): this is a variable-length array of (d+11)*n elements
           and assumes n >= 1; for large n a heap allocation may be needed. */
        long unsigned int digit_local[d+11][n];

        /* digit_local only needs to exist on the device (create); digits is
           fully overwritten below and copied back when the region ends. */
        #pragma acc data create(digit_local) copyout(digits)
        {

        /* Phase 1: every i fills its own column with the quotient digits of
           1/i. The column is zeroed first because the division loop stops
           early once the remainder reaches zero. */
        #pragma acc parallel loop gang vector
        for (long unsigned int i = 1; i <= n; ++i) {
                for (long unsigned int j = 0; j < d + 11; ++j)
                        digit_local[j][i - 1] = 0;
                long unsigned int remainder = 1;
                for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
                        long unsigned int div = remainder / i;
                        long unsigned int mod = remainder % i;
                        digit_local[digit][i - 1] += div;
                        remainder = mod * 10;
                }
        }

        /* Phase 2: for each digit position, sum that row across all n
           columns with a vector reduction and store the total in digits. */
        #pragma acc parallel loop gang
        for (long unsigned int digit = 0; digit < d + 11; ++digit) {
                long unsigned int dsum = 0;
                #pragma acc loop vector reduction(+:dsum)
                for (long unsigned int i = 0; i < n; ++i) {
                        dsum += digit_local[digit][i];
                }
                digits[digit] = dsum;
        }
        }
    

    Though, I'm not sure this will see any speed-up either.

    Hope this helps, Mat