I'm writing OpenACC code that has an array dependence: different iterations of the outer loop can update the same element of the array. Here's the code:
// Serial reference version. For every i in 1..n it performs the long division
// of 1 by i and adds each quotient digit into the matching slot of digits[].
// Slots are plain column sums; no carry propagation happens in this snippet.
// d and n are defined outside this snippet.
long unsigned int digits[d + 11];
for (long unsigned int digit = 0; digit < d + 11; ++digit)
digits[digit] = 0;
for (long unsigned int i = 1; i <= n; ++i) {
// Long division of 1 by i starts from a remainder of 1.
long unsigned int remainder = 1;
// The loop ends early once the remainder becomes 0 (the expansion terminates).
for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
long unsigned int div = remainder / i;
long unsigned int mod = remainder % i;
// Different values of i write to the same digits[digit]; this is the
// cross-iteration dependence that matters when the i loop is parallelized.
digits[digit] += div; // here
// Bring down the next decimal place.
remainder = mod * 10;
}
}
The OpenMP version was written as follows:
// OpenMP version: each thread accumulates into its own digit_local[] and the
// per-thread results are merged into the shared digits[] at the end.
// i, d, n and digits are declared outside this snippet.
#pragma omp parallel private(i)
{
// Declared inside the parallel region, so every thread has its own copy.
long unsigned int digit_local[d+11];
for(i=0;i<d+11;i++)
digit_local[i] = 0;
// Iterations of the i loop are divided among the threads of the team.
#pragma omp for
for (i = 1; i <= n; ++i) {
long unsigned int remainder = 1;
for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
long unsigned int div = remainder / i;
long unsigned int mod = remainder % i;
// Thread-local accumulation: no synchronization needed here.
digit_local[digit] += div;
remainder = mod * 10;
}
}
// Merge step: the critical section lets only one thread at a time add its
// local sums into the shared array.
#pragma omp critical
for(long unsigned int digit = 0; digit < d+11; ++digit)
digits[digit] += digit_local[digit];
}
In OpenACC the private clause works with arrays, but I have no idea how to merge the private arrays back into the global array.
Thanks.
You would use an OpenACC "atomic update" directive.
// Make the read-modify-write of the shared element atomic, so concurrent
// iterations that hit the same digits[digit] do not lose increments.
#pragma acc atomic update
digits[digit] += div; // here
Alternatively, you could do something similar to your OpenMP version.
/*
 * Atomic-free alternative: every value of i gets its own column of a scratch
 * array, and a second kernel sums each row into digits[].
 *
 * digit_local[k][c] holds the k-th long-division quotient of 1/(c + 1).
 * The i loop runs 1..n while the columns are 0..n-1, so the column index is
 * i - 1; indexing with i itself would run one element past the end of each row.
 *
 * i, d, n and digits are declared outside this snippet; n >= 1 is assumed.
 * NOTE(review): digit_local is a (d + 11) * n element VLA on the host side;
 * for large n consider a heap allocation instead.
 */
long unsigned int digit_local[d + 11][n];
#pragma acc data create(digit_local) copyout(digits)
{
    /* Kernel 1: one independent long division per i, no shared writes. */
    #pragma acc parallel loop gang vector
    for (i = 1; i <= n; ++i) {
        const long unsigned int col = i - 1;

        /* The scratch array is only created on the device, so zero this column first. */
        for (long unsigned int j = 0; j < d + 11; j++)
            digit_local[j][col] = 0;

        long unsigned int remainder = 1;
        for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
            long unsigned int div = remainder / i;
            long unsigned int mod = remainder % i;
            digit_local[digit][col] += div;
            remainder = mod * 10;
        }
    }

    /* Kernel 2: reduce each row of the scratch array into digits[]. */
    #pragma acc parallel loop gang
    for (long unsigned int digit = 0; digit < d + 11; ++digit) {
        long unsigned int dsum = 0;
        #pragma acc loop vector reduction(+:dsum)
        for (i = 1; i <= n; ++i) {
            dsum += digit_local[digit][i - 1];
        }
        /* Assignment, not accumulation: digits is copyout and is not read on the device. */
        digits[digit] = dsum;
    }
}
That said, I'm not sure this version will give you any speed-up either.
Hope this helps, Mat