c arrays openacc pgcc

OpenACC: How to select an array on the device from a pointer to the corresponding array on the host


I am trying to offload existing C code to a GPU using OpenACC. The original CPU code frequently has to select a data array based on the value of some parameter. A sample of the CPU code is given below:

#include <stdio.h>
#include <stdlib.h>


/* Sets *F to one of the global arrays C or D, chosen by the value of a. */
void selectArray (int **F, int a);

#define NN 1000 /* number of int elements allocated for each of C, D and E */
int *C, *D, *E; /* C, D: arrays selectArray chooses between; E: result array written by main */

/*
 * CPU reference version: fills C and D, lets selectArray() pick one of
 * them according to a, stores twice the selected values in E and prints
 * the first 200 results.
 *
 * Returns 0 on success, EXIT_FAILURE if an allocation fails.
 */
int main(void)
{
    int *F = NULL, a = 10; // a is the parameter used to select the array

    C = malloc(NN * sizeof *C);
    D = malloc(NN * sizeof *D);
    E = malloc(NN * sizeof *E);
    if (C == NULL || D == NULL || E == NULL)
    {
        fprintf(stderr, "out of memory\n");
        free(C); // free(NULL) is a no-op, so no per-pointer guards are needed
        free(D);
        free(E);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < NN; i++)
    {
        C[i] = 10;
        D[i] = 20;
    }

    // F becomes an alias of C or D; it never owns memory of its own
    selectArray(&F, a);

    for (int i = 0; i < NN; i++)
    {
        E[i] = 2 * F[i];
    }

    for (int i = 0; i < 200; i++)
    {
        printf("%d %d \n", i, E[i]);
    }

    // F is not freed separately: it points into C or D
    free(C);
    free(D);
    free(E);

    return 0;
}

/*
 * Select the global array that corresponds to parameter a.
 *
 * F: out-parameter; *F is set to C when a <= 15 and to D otherwise.
 * a: selection parameter.
 */
void selectArray(int **F, int a)
{
    *F = (a <= 15) ? C : D;
}

For the OpenACC version of the code, arrays C and D are already present on the GPU, and further calculations need to be done on whichever array is selected by parameter a.

#include <stdio.h>
#include <stdlib.h>

/* Sets *F to one of the global arrays C or D, chosen by the value of a. */
void selectArray(int **F, int a);

#define NN 1000 /* number of int elements allocated for each of C, D and E */
int *C, *D, *E; /* C, D: arrays selectArray chooses between; E: result array written by main */

/*
 * OpenACC attempt: C and D are created and initialised on the device,
 * then E = 2 * F is to be computed on the device, where F is the host
 * pointer that selectArray() set to either C or D.
 *
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails.
 */
int main(void)
{
    int *F = NULL, a = 10; // a is the parameter used to select the array

    C = malloc(NN * sizeof *C);
    D = malloc(NN * sizeof *D);
    E = malloc(NN * sizeof *E);
    if (C == NULL || D == NULL || E == NULL)
    {
        // Nothing has been mapped to the device yet, so only host memory needs releasing
        fprintf(stderr, "out of memory\n");
        free(C);
        free(D);
        free(E);
        return EXIT_FAILURE;
    }

#pragma acc enter data create(C[:NN], D[:NN])
#pragma acc parallel loop present(C[:NN], D[:NN])
    for (int i = 0; i < NN; i++)
    {
        C[i] = 10;
        D[i] = 20;
    }

    // F now holds the same host address as C or D
    selectArray(&F, a);

#pragma acc enter data copyin(F[:1]) create(E[:NN])

// Here, I cannot figure out how to point F to a selected array (C or D) on the device

#pragma acc parallel loop
    for (int i = 0; i < NN; i++)
    {
        E[i] = 2 * F[i]; //further calculations on selected array on GPU
    }

#pragma acc exit data delete (C[:NN], D[:NN], F) copyout(E[:200])
    for (int i = 0; i < 200; i++)
    {
        printf("%d %d \n", i, E[i]);
    }

    // F is not freed separately: it points into C or D
    free(C);
    free(D);
    free(E);

    return 0;
}

/*
 * Choose which global host array the caller should work on.
 *
 * F: out-parameter; receives C when a <= 15, otherwise D.
 * a: selection parameter.
 */
void selectArray(int **F, int a)
{
    *F = (a <= 15) ? C : D;
}

In the actual code, arrays C and D are calculated in different functions and not in the main function. I have searched the internet for a solution to this issue but could not find any related example. I am using the PGI 19.10 compiler on Windows 10. Any help in this regard would be appreciated. Thanks in advance.


Solution

  • You just need to add a "present(F)" on the parallel loop and not include "F" in a data region. Since the acc present table look-up is done by host address, if "F" matches an existing host address present on the device, it will associate "F" with the same device address. However, don't put "F" in its own data region and, in particular, don't delete it, since that would cause multiple frees of the same device array.

    I modified your code a bit so that "F" points to "C" in one case and "D" in the second.

    % cat test.c
    #include <stdio.h>
    #include <stdlib.h>
    
    void selectArray(int **F, int a); // sets *F to the global array C or D, chosen by a
    
    #define NN 1000 // number of int elements allocated for each of C, D and E
    int *C, *D, *E; // C, D: arrays selectArray chooses between; E: result array
    
    int main(void) // demo: runs the device loop twice, once per array that selectArray can return
    {
        int *F, a = 10; // a is the parameter used to select the array
    
        C = (int *)malloc(NN * sizeof(int));
        D = (int *)malloc(NN * sizeof(int));
        E = (int *)malloc(NN * sizeof(int));
    
    #pragma acc enter data create(C[:NN], D[:NN], E[:NN])
    #pragma acc parallel loop present(C[:NN], D[:NN])
        for (int i = 0; i < NN; i++) // C and D are filled inside the parallel loop marked above
        {
            C[i] = 10;
            D[i] = 20;
        }
    
        for (a=10;a<=20;a+=10) { // a=10 makes selectArray pick C, a=20 makes it pick D
           selectArray(&F, a); // F now holds the host address of C or D
    
    #pragma acc parallel loop present(E,F)
           for (int i = 0; i < NN; i++) // F appears in no data directive of its own, only in present(F) above
           {
              E[i] = 2 * F[i]; //further calculations on selected array on GPU
           }
    #pragma acc update host(E[:20])
           for (int i = 0; i < 20; i++) // prints only E[0..19], the range named in the update above
           {
              printf("a=%d E[%d]=%d \n", a, i, E[i]);
           }
         }
    #pragma acc exit data delete(C, D, E)
        return 0; // F is absent from the delete above on purpose: it aliases C or D
    }
    
    // Point *F at the global array selected by a: C when a <= 15, otherwise D.
    void selectArray(int **F, int a)
    {
        *F = (a <= 15) ? C : D;
    }
    % pgcc -ta=tesla -Minfo=accel test.c; a.out
    main:
         17, Generating enter data create(D[:1000],E[:1000],C[:1000])
         18, Generating present(D[:1000],C[:1000])
             Generating Tesla code
             19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         28, Generating present(E[:],F[:])
             Generating Tesla code
             29, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         34, Generating update self(E[:20])
         39, Generating exit data delete(E[:1],D[:1],C[:1])
    a=10 E[0]=20
    a=10 E[1]=20
    a=10 E[2]=20
    a=10 E[3]=20
    a=10 E[4]=20
    a=10 E[5]=20
    a=10 E[6]=20
    a=10 E[7]=20
    a=10 E[8]=20
    a=10 E[9]=20
    a=10 E[10]=20
    a=10 E[11]=20
    a=10 E[12]=20
    a=10 E[13]=20
    a=10 E[14]=20
    a=10 E[15]=20
    a=10 E[16]=20
    a=10 E[17]=20
    a=10 E[18]=20
    a=10 E[19]=20
    a=20 E[0]=40
    a=20 E[1]=40
    a=20 E[2]=40
    a=20 E[3]=40
    a=20 E[4]=40
    a=20 E[5]=40
    a=20 E[6]=40
    a=20 E[7]=40
    a=20 E[8]=40
    a=20 E[9]=40
    a=20 E[10]=40
    a=20 E[11]=40
    a=20 E[12]=40
    a=20 E[13]=40
    a=20 E[14]=40
    a=20 E[15]=40
    a=20 E[16]=40
    a=20 E[17]=40
    a=20 E[18]=40
    a=20 E[19]=40