I've run into a situation where the code produces different results depending on whether the work arrays are declared inside the loop over index i (case #1) or declared outside the i loop and listed in a private clause (case #2). Only case #2 reproduces the results of the CPU-only run.
Case #1
#pragma acc parallel loop
for (j = jbeg; j <= jend; j++){
  #pragma acc loop
  for (i = ibeg; i <= iend; i++){
    double Rc[NFLX][NFLX];
    double eta[NFLX], um[NFLX], dv[NFLX];
    double lambda[NFLX], alambda[NFLX];
    double fL[NFLX], fR[NFLX];
    /* ... */
  }
}
Case #2
#pragma acc parallel loop
for (j = jbeg; j <= jend; j++){
  double Rc[NFLX][NFLX];
  double eta[NFLX], um[NFLX], dv[NFLX];
  double lambda[NFLX], alambda[NFLX];
  double fL[NFLX], fR[NFLX];
  #pragma acc loop private(Rc[:NFLX][:NFLX], eta[:NFLX], \
                           um[:NFLX], lambda[:NFLX], alambda[:NFLX], \
                           dv[:NFLX], fL[:NFLX], fR[:NFLX])
  for (i = ibeg; i <= iend; i++){
    /* ... */
  }
}
I have the following values:
NFLX = 8;
jbeg = 3, jend = 258;
ibeg = 3, iend = 1026;
In which cases are the two techniques equivalent, and when is it better to choose one over the other?
This is what I see with -Minfo=accel:
case #1:
71, Local memory used for Rc,dv,fR,um,lambda,alambda,fL,eta
case #2:
71, Local memory used for Rc,dv,fR,lambda,alambda,fL,eta
CUDA shared memory used for Rc,eta
Local memory used for um
CUDA shared memory used for um,lambda,alambda,dv,fL,fR
Here is the full function:
/* ********************************************************************* */
void Roe_Solver (Data *d, timeStep *Dts, Grid *grid, RBox *box)
/*
 * Solve the Riemann problem between L/R states using a
 * Rusanov-Lax Friedrichs flux.
 *********************************************************************** */
{
  int i, j, k;
  int ibeg = *(box->nbeg)-1, iend = *(box->nend);
  int jbeg = *(box->tbeg), jend = *(box->tend);
  int kbeg = *(box->bbeg), kend = *(box->bend);
  int VXn = VX1, VXt = VX2, VXb = VX3;
  int MXn = MX1, MXt = MX2, MXb = MX3;
  int ni, nj;

  double gmm       = GAMMA_EOS;
  double gmm1      = gmm - 1.0;
  double gmm1_inv  = 1.0/gmm1;
  double delta     = 1.e-7;
  double delta_inv = 1.0/delta;

  ARRAY_OFFSET (grid, ni, nj);
  INDEX_CYCLE (grid->dir, VXn, VXt, VXb);
  INDEX_CYCLE (grid->dir, MXn, MXt, MXb);

  #pragma acc parallel loop collapse(2) present(d, Dts, grid)
  for (k = kbeg; k <= kend; k++){
  for (j = jbeg; j <= jend; j++){
    long int offset = ni*(j + nj*k);

    double * __restrict__ cmax = &Dts->cmax [offset];
    double * __restrict__ SL   = &d->sweep.SL [offset];
    double * __restrict__ SR   = &d->sweep.SR [offset];

    double um[NFLX];
    double fL[NFLX], fR[NFLX];

    #pragma acc loop private(um[:NFLX], fL[:NFLX], fR[:NFLX])
    for (i = ibeg; i <= iend; i++){
      int nv;
      double scrh, vel2;
      double a2, a, h;
      double alambda, lambda, eta;
      double s, c, hl, hr;
      double bmin, bmax, scrh1;
      double pL, pR;

      double * __restrict__ vL   = d->sweep.vL  [offset + i];
      double * __restrict__ vR   = d->sweep.vR  [offset + i];
      double * __restrict__ uL   = d->sweep.uL  [offset + i];
      double * __restrict__ uR   = d->sweep.uR  [offset + i];
      double * __restrict__ flux = d->sweep.flux[offset + i];

      double a2L = SoundSpeed2 (vL);
      double a2R = SoundSpeed2 (vR);

      PrimToCons (vL, uL);
      PrimToCons (vR, uR);

      Flux (vL, uL, fL, grid->dir);
      Flux (vR, uR, fR, grid->dir);

      pL = vL[PRS];
      pR = vR[PRS];

      s       = sqrt(vR[RHO]/vL[RHO]);
      um[RHO] = vL[RHO]*s;
      s       = 1.0/(1.0 + s);
      c       = 1.0 - s;

      um[VX1] = s*vL[VX1] + c*vR[VX1];
      um[VX2] = s*vL[VX2] + c*vR[VX2];
      um[VX3] = s*vL[VX3] + c*vR[VX3];

      vel2 = um[VX1]*um[VX1] + um[VX2]*um[VX2] + um[VX3]*um[VX3];

      hl  = 0.5*(vL[VX1]*vL[VX1] + vL[VX2]*vL[VX2] + vL[VX3]*vL[VX3]);
      hl += a2L*gmm1_inv;

      hr  = 0.5*(vR[VX1]*vR[VX1] + vR[VX2]*vR[VX2] + vR[VX3]*vR[VX3]);
      hr += a2R*gmm1_inv;

      h = s*hl + c*hr;

      /* ----------------------------------------------------
         1. the following should be equivalent to
            scrh = dv[VX1]*dv[VX1] + dv[VX2]*dv[VX2] + dv[VX3]*dv[VX3];
            a2 = s*a2L + c*a2R + 0.5*gmm1*s*c*scrh;
            and therefore always positive.
         ---------------------------------------------------- */
      a2 = gmm1*(h - 0.5*vel2);
      a  = sqrt(a2);

      /* ----------------------------------------------------------------
         2. define non-zero components of conservative eigenvectors Rc,
            eigenvalues (lambda) and wave strength eta = L.du
         ---------------------------------------------------------------- */
      #pragma acc loop seq
      NFLX_LOOP(nv) flux[nv] = 0.5*(fL[nv] + fR[nv]);

      /* ---- (u - c_s) ---- */
      SL[i] = um[VXn] - a;

      /* ---- (u + c_s) ---- */
      SR[i] = um[VXn] + a;

      /* ---- get max eigenvalue ---- */
      cmax[i] = fabs(um[VXn]) + a;

      NFLX_LOOP(nv) flux[nv] = 0.5*(fL[nv] + fR[nv]) - 0.5*cmax[i]*(uR[nv] - uL[nv]);

      #if DIMENSIONS > 1
      /* ---------------------------------------------
         3. use the HLL flux function if the interface
            lies within a strong shock.
            The effect of this switch is visible
            in the Mach reflection test.
         --------------------------------------------- */
      scrh  = fabs(vL[PRS] - vR[PRS]);
      scrh /= MIN(vL[PRS],vR[PRS]);
      if (scrh > 0.5 && (vR[VXn] < vL[VXn])){ /* -- tunable parameter -- */
        bmin  = MIN(0.0, SL[i]);
        bmax  = MAX(0.0, SR[i]);
        scrh1 = 1.0/(bmax - bmin);
        #pragma acc loop seq
        for (nv = 0; nv < NFLX; nv++){
          flux[nv]  = bmin*bmax*(uR[nv] - uL[nv])
                    + bmax*fL[nv] - bmin*fR[nv];
          flux[nv] *= scrh1;
        }
      }
      #endif /* DIMENSIONS > 1 */

    } /* End loop on i */
  }} /* End loop on j,k */
}
Technically the two forms are equivalent, but in practice they behave differently. What's happening is that the compiler hoists the declaration of these arrays out of the loops. This is standard practice for the compiler and happens before the OpenACC directives are applied. What should then happen is that the arrays are implicitly privatized within the scoping unit in which they were declared. However, the compiler doesn't currently track this, so the arrays are instead implicitly copied into the compute region as shared arrays. If you add the flag "-Minfo=accel", you'll see compiler feedback messages indicating the implicit copies.
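To make the hoisting concrete, here is a minimal standalone sketch of what case #1 effectively becomes once a block-local array has been hoisted. The sizes N and M, the array tmp, and the dummy computation are placeholders for illustration, not taken from the solver above: the point is only that the hoisted array picks up an implicit copy and is shared by every iteration of the parallel loop, so the iterations race on it.

#include <stdio.h>

#define N 8      /* stands in for NFLX        */
#define M 1024   /* stands in for the i range */

int main(void)
{
  double out[M];

  /* The source declares tmp inside the loop body (as in case #1), but the
     compiler hoists the declaration up here before the OpenACC directives
     are processed ...                                                      */
  double tmp[N];

  /* ... so tmp picks up an implicit copy clause and every iteration of the
     loop below reads and writes the same single copy: a data race, and
     results that can differ from the CPU run.                              */
  #pragma acc parallel loop copyout(out[0:M])
  for (int i = 0; i < M; i++){
    for (int nv = 0; nv < N; nv++) tmp[nv] = i + nv;

    double s = 0.0;
    for (int nv = 0; nv < N; nv++) s += tmp[nv];
    out[i] = s;
  }

  printf("out[0] = %g\n", out[0]);
  return 0;
}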
I have an open issue report requesting this support, TPR #31360, but it has proven challenging to implement and is not in a released compiler yet. Until/unless we can fix the behavior, you'll need to manually hoist the declaration of these arrays and then add them to a "private" clause.
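Applied to a simple kernel, the workaround pattern looks like the sketch below (again a simplified stand-in with placeholder names, not the Roe solver itself): declare the arrays just above the loop they should be private to, list them in a private clause, and then recheck the feedback from something like "nvc -acc -Minfo=accel" (assuming the NVHPC compilers) to confirm that no implicit copies remain for those arrays.

#include <stdio.h>

#define N 8      /* stands in for NFLX        */
#define M 1024   /* stands in for the i range */

int main(void)
{
  double out[M];

  /* Workaround: hoist the declaration manually and name the array in a
     private clause so that each loop iteration gets its own copy.       */
  double tmp[N];

  #pragma acc parallel loop copyout(out[0:M]) private(tmp[0:N])
  for (int i = 0; i < M; i++){
    for (int nv = 0; nv < N; nv++) tmp[nv] = i + nv;

    double s = 0.0;
    for (int nv = 0; nv < N; nv++) s += tmp[nv];
    out[i] = s;   /* now matches the CPU result */
  }

  printf("out[0] = %g\n", out[0]);
  return 0;
}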