I've run into a situation where the code produces different results depending on whether the work arrays are declared inside the loop over index i (case #1) or declared outside the i loop and listed in a private clause (case #2). Only case #2 reproduces the results of the CPU-only run.
Case #1
#pragma acc parallel loop
for (j = jbeg; j <= jend; j++){
  #pragma acc loop
  for (i = ibeg; i <= iend; i++){
    double Rc[NFLX][NFLX];
    double eta[NFLX], um[NFLX], dv[NFLX];
    double lambda[NFLX], alambda[NFLX];
    double fL[NFLX], fR[NFLX];
    /* ... */
  }
}
Case #2
#pragma acc parallel loop
for (j = jbeg; j <= jend; j++){
  double Rc[NFLX][NFLX];
  double eta[NFLX], um[NFLX], dv[NFLX];
  double lambda[NFLX], alambda[NFLX];
  double fL[NFLX], fR[NFLX];
  #pragma acc loop private(Rc[:NFLX][:NFLX], eta[:NFLX], \
                           um[:NFLX], lambda[:NFLX], alambda[:NFLX], \
                           dv[:NFLX], fL[:NFLX], fR[:NFLX])
  for (i = ibeg; i <= iend; i++){
    /* ... */
  }
}
I have the following values:
NFLX = 8;
jbeg = 3, jend = 258;
ibeg = 3, iend = 1026;
In which cases are the two techniques equivalent, and when is it better to choose one over the other?
This is what I see with -Minfo=accel:
case #1:
71, Local memory used for Rc,dv,fR,um,lambda,alambda,fL,eta
case #2:
71, Local memory used for Rc,dv,fR,lambda,alambda,fL,eta
CUDA shared memory used for Rc,eta
Local memory used for um
CUDA shared memory used for um,lambda,alambda,dv,fL,fR
Here is the full function:
/* ********************************************************************* */
void Roe_Solver (Data *d, timeStep *Dts, Grid *grid, RBox *box)
/*
 * Solve the Riemann problem between L/R states using a
 * Rusanov-Lax Friedrichs flux.
 *********************************************************************** */
{
  int i, j, k;
  int ibeg = *(box->nbeg)-1, iend = *(box->nend);
  int jbeg = *(box->tbeg), jend = *(box->tend);
  int kbeg = *(box->bbeg), kend = *(box->bend);
  int VXn = VX1, VXt = VX2, VXb = VX3;
  int MXn = MX1, MXt = MX2, MXb = MX3;
  int ni, nj;

  double gmm       = GAMMA_EOS;
  double gmm1      = gmm - 1.0;
  double gmm1_inv  = 1.0/gmm1;
  double delta     = 1.e-7;
  double delta_inv = 1.0/delta;

  ARRAY_OFFSET (grid, ni, nj);
  INDEX_CYCLE (grid->dir, VXn, VXt, VXb);
  INDEX_CYCLE (grid->dir, MXn, MXt, MXb);

  #pragma acc parallel loop collapse(2) present(d, Dts, grid)
  for (k = kbeg; k <= kend; k++){
  for (j = jbeg; j <= jend; j++){
    long int offset = ni*(j + nj*k);

    double * __restrict__ cmax = &Dts->cmax [offset];
    double * __restrict__ SL   = &d->sweep.SL [offset];
    double * __restrict__ SR   = &d->sweep.SR [offset];

    double um[NFLX];
    double fL[NFLX], fR[NFLX];

    #pragma acc loop private(um[:NFLX], fL[:NFLX], fR[:NFLX])
    for (i = ibeg; i <= iend; i++){
      int nv;
      double scrh, vel2;
      double a2, a, h;
      double alambda, lambda, eta;
      double s, c, hl, hr;
      double bmin, bmax, scrh1;
      double pL, pR;

      double * __restrict__ vL   = d->sweep.vL  [offset + i];
      double * __restrict__ vR   = d->sweep.vR  [offset + i];
      double * __restrict__ uL   = d->sweep.uL  [offset + i];
      double * __restrict__ uR   = d->sweep.uR  [offset + i];
      double * __restrict__ flux = d->sweep.flux[offset + i];

      double a2L = SoundSpeed2 (vL);
      double a2R = SoundSpeed2 (vR);

      PrimToCons (vL, uL);
      PrimToCons (vR, uR);

      Flux (vL, uL, fL, grid->dir);
      Flux (vR, uR, fR, grid->dir);

      pL = vL[PRS];
      pR = vR[PRS];

      s       = sqrt(vR[RHO]/vL[RHO]);
      um[RHO] = vL[RHO]*s;
      s       = 1.0/(1.0 + s);
      c       = 1.0 - s;

      um[VX1] = s*vL[VX1] + c*vR[VX1];
      um[VX2] = s*vL[VX2] + c*vR[VX2];
      um[VX3] = s*vL[VX3] + c*vR[VX3];

      vel2 = um[VX1]*um[VX1] + um[VX2]*um[VX2] + um[VX3]*um[VX3];

      hl  = 0.5*(vL[VX1]*vL[VX1] + vL[VX2]*vL[VX2] + vL[VX3]*vL[VX3]);
      hl += a2L*gmm1_inv;

      hr  = 0.5*(vR[VX1]*vR[VX1] + vR[VX2]*vR[VX2] + vR[VX3]*vR[VX3]);
      hr += a2R*gmm1_inv;

      h = s*hl + c*hr;

      /* ----------------------------------------------------
         1. the following should be equivalent to
            scrh = dv[VX1]*dv[VX1] + dv[VX2]*dv[VX2] + dv[VX3]*dv[VX3];
            a2 = s*a2L + c*a2R + 0.5*gmm1*s*c*scrh;
            and therefore always positive.
         ---------------------------------------------------- */
      a2 = gmm1*(h - 0.5*vel2);
      a  = sqrt(a2);

      /* ----------------------------------------------------------------
         2. define non-zero components of conservative eigenvectors Rc,
            eigenvalues (lambda) and wave strength eta = L.du
         ---------------------------------------------------------------- */
      #pragma acc loop seq
      NFLX_LOOP(nv) flux[nv] = 0.5*(fL[nv] + fR[nv]);

      /* ---- (u - c_s) ---- */
      SL[i] = um[VXn] - a;

      /* ---- (u + c_s) ---- */
      SR[i] = um[VXn] + a;

      /* ---- get max eigenvalue ---- */
      cmax[i] = fabs(um[VXn]) + a;

      NFLX_LOOP(nv) flux[nv] = 0.5*(fL[nv] + fR[nv]) - 0.5*cmax[i]*(uR[nv] - uL[nv]);

      #if DIMENSIONS > 1
      /* ---------------------------------------------
         3. use the HLL flux function if the interface
            lies within a strong shock.
            The effect of this switch is visible
            in the Mach reflection test.
         --------------------------------------------- */
      scrh  = fabs(vL[PRS] - vR[PRS]);
      scrh /= MIN(vL[PRS],vR[PRS]);
      if (scrh > 0.5 && (vR[VXn] < vL[VXn])){ /* -- tunable parameter -- */
        bmin  = MIN(0.0, SL[i]);
        bmax  = MAX(0.0, SR[i]);
        scrh1 = 1.0/(bmax - bmin);
        #pragma acc loop seq
        for (nv = 0; nv < NFLX; nv++){
          flux[nv]  = bmin*bmax*(uR[nv] - uL[nv])
                    + bmax*fL[nv] - bmin*fR[nv];
          flux[nv] *= scrh1;
        }
      }
      #endif /* DIMENSIONS > 1 */

    } /* End loop on i */
  }} /* End loop on j,k */
}
Technically the two forms are equivalent, but in practice they behave differently. What's happening is that the compiler hoists the declaration of these arrays out of the loops. This is standard practice for the compiler and happens before the OpenACC directives are applied. What should then happen is that the arrays are implicitly privatized within the scoping unit in which they were declared. However, the compiler doesn't currently track this, so the arrays are instead implicitly copied into the compute region as shared arrays. If you add the flag "-Minfo=accel", you'll see compiler feedback messages indicating the implicit copies.
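To make the hoisting concrete, here is a minimal standalone sketch of what case #1 effectively becomes once a block-local array has been hoisted. The sizes N and M, the array tmp, and the dummy computation are placeholders for illustration, not taken from the solver above: the point is only that the hoisted array picks up an implicit copy and is shared by every iteration of the parallel loop, so the iterations race on it.

#include <stdio.h>

#define N 8      /* stands in for NFLX        */
#define M 1024   /* stands in for the i range */

int main(void)
{
  double out[M];

  /* The source declares tmp inside the loop body (as in case #1), but the
     compiler hoists the declaration up here before the OpenACC directives
     are processed ...                                                      */
  double tmp[N];

  /* ... so tmp picks up an implicit copy clause and every iteration of the
     loop below reads and writes the same single copy: a data race, and
     results that can differ from the CPU run.                              */
  #pragma acc parallel loop copyout(out[0:M])
  for (int i = 0; i < M; i++){
    for (int nv = 0; nv < N; nv++) tmp[nv] = i + nv;

    double s = 0.0;
    for (int nv = 0; nv < N; nv++) s += tmp[nv];
    out[i] = s;
  }

  printf("out[0] = %g\n", out[0]);
  return 0;
}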
I have an open issue report requesting this support, TPR #31360, but it has proven challenging to implement and is not in a released compiler yet. Until/unless we can fix the behavior, you'll need to manually hoist the declaration of these arrays and then add them to a "private" clause.
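Applied to a simple kernel, the workaround pattern looks like the sketch below (again a simplified stand-in with placeholder names, not the Roe solver itself): declare the arrays just above the loop they should be private to, list them in a private clause, and then recheck the feedback from something like "nvc -acc -Minfo=accel" (assuming the NVHPC compilers) to confirm that no implicit copies remain for those arrays.

#include <stdio.h>

#define N 8      /* stands in for NFLX        */
#define M 1024   /* stands in for the i range */

int main(void)
{
  double out[M];

  /* Workaround: hoist the declaration manually and name the array in a
     private clause so that each loop iteration gets its own copy.       */
  double tmp[N];

  #pragma acc parallel loop copyout(out[0:M]) private(tmp[0:N])
  for (int i = 0; i < M; i++){
    for (int nv = 0; nv < N; nv++) tmp[nv] = i + nv;

    double s = 0.0;
    for (int nv = 0; nv < N; nv++) s += tmp[nv];
    out[i] = s;   /* now matches the CPU result */
  }

  printf("out[0] = %g\n", out[0]);
  return 0;
}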