cstructopenaccpgi

How to perform manual deep copy of 2D dynamic array of struct in C using OpenACC


I am trying to modify an existing particle method code using OpenACC to run on GPU. The existing code utilizes a 2D dynamic array of struct in c. I need to copy the structure(s) to GPU for further calculation. A code sample is given below:

typedef struct{
  int *list;  // it is list of particles in a given bucket
  int  count; // it is the total number of particles in the bucket
} structBucket;


typedef struct{
structBucket  **bucket;
int    numberOfBuckets[2]; // number of buckets in x- and y- dimensions
} structDomain;

structDomain domain;

// Allocate memory for **bucket
  domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;
  int iX,iY, capacity;

  domain.bucket = (structBucket**)malloc( sizeof(structBucket*) * domain.numberOfBuckets[XDIM] );

   for (iX=0 ; iX < domain.numberOfBuckets[XDIM] ; iX++) 
      domain.bucket[iX] = (structBucket*)malloc( sizeof(structBucket) * domain.numberOfBuckets[YDIM]);

// Calculate domain.bucket[iX][iY].count here using some logic
.
.
.
// Allocate memory for *list
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        capacity = domain.bucket[iX][iY].count;

        if (capacity > 0)
        {
          domain.bucket[iX][iY].list = (int *)malloc(sizeof(int) * capacity);
        }
    }
  }

After reviewing various sources on the internet, I have come up with the following solution (which might be utterly wrong)"

// It is needed to create the memory for **bucket and *list on GPU. 
#pragma acc enter data copyin(domain)
#pragma acc enter data copyin(domain.bucket)
#pragma acc enter data create(domain.bucket[0:domain.numberOfBuckets[XDIM]][0:domain.numberOfBuckets[YDIM]])
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        #pragma acc enter data create(domain.bucket[iX][iY].list[0:domain.bucket[iX][iY].count])
    }
  }

It is requested for an advice manual deep copy of **bucket and *list to GPU memory. Is my solution accurate? Could someone suggest improvements or a better solution for manual deep copy of said struct(s).

I am using PGI 19.4 compiler on Windows 10. Many thanks


Solution

  • Close. The only thing I'd do different is to not create "domain.bucket" and update the bucket's count so the device has this information. Also, since updates/copies are shallow, be sure to only update the list array or scalars in the structs. Otherwise you may overwrite device/host pointers. Here's an example. While I'm using Linux, other than the executable name, the code should the same.

    % cat test.c
    
    #include <stdio.h>
    #include <stdlib.h>
    
    typedef struct{
      int *list;  // it is list of particles in a given bucket
      int  count; // it is the total number of particles in the bucket
    } structBucket;
    
    
    typedef struct{
    structBucket  **bucket;
    int    numberOfBuckets[2]; // number of buckets in x- and y- dimensions
    } structDomain;
    
    #define XDIM 64
    #define YDIM 64
    
    int main() {
    
      structDomain domain;
      int iX,iY, capacity;
    
    // Allocate memory for **bucket
      domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;
    
      domain.bucket = (structBucket**)malloc( sizeof(structBucket*) * domain.numberOfBuckets[XDIM] );
    
       for (iX=0 ; iX < domain.numberOfBuckets[XDIM] ; iX++)
          domain.bucket[iX] = (structBucket*)malloc( sizeof(structBucket) * domain.numberOfBuckets[YDIM]);
    
    
    // Calculate domain.bucket[iX][iY].count here using some logic
      for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
      {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
        {
           domain.bucket[iX][iY].count = iX*domain.numberOfBuckets[YDIM]+iY;
      }}
    #pragma acc enter data copyin(domain)
    #pragma acc enter data create(domain.bucket[:domain.numberOfBuckets[XDIM]][:domain.numberOfBuckets[YDIM]])
    // Allocate memory for *list
      for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
      {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
        {
            capacity = domain.bucket[iX][iY].count;
    #pragma acc update device(domain.bucket[iX][iY].count)
            if (capacity > 0)
            {
              domain.bucket[iX][iY].list = (int *)malloc(sizeof(int) * capacity);
    #pragma acc enter data create(domain.bucket[iX][iY].list[:capacity])
            }
        }
      }
    
    #pragma acc parallel loop gang collapse(2) present(domain)
      for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
      {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
        {
            capacity = domain.bucket[iX][iY].count;
            if (capacity > 0) {
    #pragma acc loop vector
               for (int i = 0; i < capacity; ++i) {
                    domain.bucket[iX][iY].list[i] = i;
               }
            }
       }}
    
      for (iX = 0; iX < 5; iX++)
      {
        for (iY = 0; iY < 5; iY++)
        {
            capacity = domain.bucket[iX][iY].count;
            if (capacity > 0) {
    #pragma acc update host(domain.bucket[iX][iY].list[:capacity])
               printf("iX=%d iY=%d Cnt=%d\n\t",iX,iY,capacity);
               for (int i = 0; i < capacity; ++i) {
                    printf("%d ",domain.bucket[iX][iY].list[i]);
               }
               printf("\n");
            }
       }}
    
      exit(0);
    }
    % pgcc test.c -ta=tesla -Minfo=accel -V19.4
    main:
         40, Generating enter data copyin(domain)
         41, Generating enter data create(domain.bucket[:domain.numberOfBuckets][:domain.numberOfBuckets])
         49, Generating update device(domain.bucket->->count)
         52, Generating enter data create(domain.bucket->->list[:capacity])
         57, Generating present(domain)
             Generating Tesla code
             58, #pragma acc loop gang collapse(2) /* blockIdx.x */
             60,   /* blockIdx.x collapsed */
             65, #pragma acc loop vector(128) /* threadIdx.x */
         65, Accelerator restriction: size of the GPU copy of domain.bucket is unknown
             Loop is parallelizable
         78, Generating update self(domain.bucket->->list[:capacity])
    % a.out
    iX=0 iY=1 Cnt=1
            0
    iX=0 iY=2 Cnt=2
            0 1
    iX=0 iY=3 Cnt=3
            0 1 2
    iX=0 iY=4 Cnt=4
            0 1 2 3
    iX=1 iY=0 Cnt=5
            0 1 2 3 4
    iX=1 iY=1 Cnt=6
            0 1 2 3 4 5
    iX=1 iY=2 Cnt=7
            0 1 2 3 4 5 6
    iX=1 iY=3 Cnt=8
            0 1 2 3 4 5 6 7
    iX=1 iY=4 Cnt=9
            0 1 2 3 4 5 6 7 8
    iX=2 iY=0 Cnt=10
            0 1 2 3 4 5 6 7 8 9
    iX=2 iY=1 Cnt=11
            0 1 2 3 4 5 6 7 8 9 10
    iX=2 iY=2 Cnt=12
            0 1 2 3 4 5 6 7 8 9 10 11
    iX=2 iY=3 Cnt=13
            0 1 2 3 4 5 6 7 8 9 10 11 12
    iX=2 iY=4 Cnt=14
            0 1 2 3 4 5 6 7 8 9 10 11 12 13
    iX=3 iY=0 Cnt=15
            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
    iX=3 iY=1 Cnt=16
            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    iX=3 iY=2 Cnt=17
            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
    iX=3 iY=3 Cnt=18
            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
    iX=3 iY=4 Cnt=19
            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
    iX=4 iY=0 Cnt=20
            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
    iX=4 iY=1 Cnt=21
            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
    iX=4 iY=2 Cnt=22
            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
    iX=4 iY=3 Cnt=23
            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
    iX=4 iY=4 Cnt=24
            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23