How to perform manual deep copy of 2D dynamic array of struct in C using OpenACC

I am trying to modify an existing particle method code using OpenACC to run on GPU. The existing code utilizes a 2D dynamic array of struct in c. I need to copy the structure(s) to GPU for further calculation. A code sample is given below:

typedef struct{
  int *list;  // it is list of particles in a given bucket
  int  count; // it is the total number of particles in the bucket
} structBucket;


typedef struct{
structBucket  **bucket;
int    numberOfBuckets[2]; // number of buckets in x- and y- dimensions
} structDomain;

structDomain domain;

// Allocate memory for **bucket
  domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;
  int iX,iY, capacity;

  domain.bucket = (structBucket**)malloc( sizeof(structBucket*) * domain.numberOfBuckets[XDIM] );

   for (iX=0 ; iX < domain.numberOfBuckets[XDIM] ; iX++) 
      domain.bucket[iX] = (structBucket*)malloc( sizeof(structBucket) * domain.numberOfBuckets[YDIM]);

// Calculate domain.bucket[iX][iY].count here using some logic
.
.
.
// Allocate memory for *list
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        capacity = domain.bucket[iX][iY].count;

        if (capacity > 0)
        {
          domain.bucket[iX][iY].list = (int *)malloc(sizeof(int) * capacity);
        }
    }
  }

After reviewing various sources on the internet, I have come up with the following solution (which might be utterly wrong)"

// It is needed to create the memory for **bucket and *list on GPU. 
#pragma acc enter data copyin(domain)
#pragma acc enter data copyin(domain.bucket)
#pragma acc enter data create(domain.bucket[0:domain.numberOfBuckets[XDIM]][0:domain.numberOfBuckets[YDIM]])
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        #pragma acc enter data create(domain.bucket[iX][iY].list[0:domain.bucket[iX][iY].count])
    }
  }

It is requested for an advice manual deep copy of **bucket and *list to GPU memory. Is my solution accurate? Could someone suggest improvements or a better solution for manual deep copy of said struct(s).

I am using PGI 19.4 compiler on Windows 10. Many thanks

Solution

Close. The only thing I'd do different is to not create "domain.bucket" and update the bucket's count so the device has this information. Also, since updates/copies are shallow, be sure to only update the list array or scalars in the structs. Otherwise you may overwrite device/host pointers. Here's an example. While I'm using Linux, other than the executable name, the code should the same.

% cat test.c

#include <stdio.h>
#include <stdlib.h>

typedef struct{
  int *list;  // it is list of particles in a given bucket
  int  count; // it is the total number of particles in the bucket
} structBucket;


typedef struct{
structBucket  **bucket;
int    numberOfBuckets[2]; // number of buckets in x- and y- dimensions
} structDomain;

#define XDIM 64
#define YDIM 64

int main() {

  structDomain domain;
  int iX,iY, capacity;

// Allocate memory for **bucket
  domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;

  domain.bucket = (structBucket**)malloc( sizeof(structBucket*) * domain.numberOfBuckets[XDIM] );

   for (iX=0 ; iX < domain.numberOfBuckets[XDIM] ; iX++)
      domain.bucket[iX] = (structBucket*)malloc( sizeof(structBucket) * domain.numberOfBuckets[YDIM]);


// Calculate domain.bucket[iX][iY].count here using some logic
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
       domain.bucket[iX][iY].count = iX*domain.numberOfBuckets[YDIM]+iY;
  }}
#pragma acc enter data copyin(domain)
#pragma acc enter data create(domain.bucket[:domain.numberOfBuckets[XDIM]][:domain.numberOfBuckets[YDIM]])
// Allocate memory for *list
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        capacity = domain.bucket[iX][iY].count;
#pragma acc update device(domain.bucket[iX][iY].count)
        if (capacity > 0)
        {
          domain.bucket[iX][iY].list = (int *)malloc(sizeof(int) * capacity);
#pragma acc enter data create(domain.bucket[iX][iY].list[:capacity])
        }
    }
  }

#pragma acc parallel loop gang collapse(2) present(domain)
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        capacity = domain.bucket[iX][iY].count;
        if (capacity > 0) {
#pragma acc loop vector
           for (int i = 0; i < capacity; ++i) {
                domain.bucket[iX][iY].list[i] = i;
           }
        }
   }}

  for (iX = 0; iX < 5; iX++)
  {
    for (iY = 0; iY < 5; iY++)
    {
        capacity = domain.bucket[iX][iY].count;
        if (capacity > 0) {
#pragma acc update host(domain.bucket[iX][iY].list[:capacity])
           printf("iX=%d iY=%d Cnt=%d\n\t",iX,iY,capacity);
           for (int i = 0; i < capacity; ++i) {
                printf("%d ",domain.bucket[iX][iY].list[i]);
           }
           printf("\n");
        }
   }}

  exit(0);
}
% pgcc test.c -ta=tesla -Minfo=accel -V19.4
main:
     40, Generating enter data copyin(domain)
     41, Generating enter data create(domain.bucket[:domain.numberOfBuckets][:domain.numberOfBuckets])
     49, Generating update device(domain.bucket->->count)
     52, Generating enter data create(domain.bucket->->list[:capacity])
     57, Generating present(domain)
         Generating Tesla code
         58, #pragma acc loop gang collapse(2) /* blockIdx.x */
         60,   /* blockIdx.x collapsed */
         65, #pragma acc loop vector(128) /* threadIdx.x */
     65, Accelerator restriction: size of the GPU copy of domain.bucket is unknown
         Loop is parallelizable
     78, Generating update self(domain.bucket->->list[:capacity])
% a.out
iX=0 iY=1 Cnt=1
        0
iX=0 iY=2 Cnt=2
        0 1
iX=0 iY=3 Cnt=3
        0 1 2
iX=0 iY=4 Cnt=4
        0 1 2 3
iX=1 iY=0 Cnt=5
        0 1 2 3 4
iX=1 iY=1 Cnt=6
        0 1 2 3 4 5
iX=1 iY=2 Cnt=7
        0 1 2 3 4 5 6
iX=1 iY=3 Cnt=8
        0 1 2 3 4 5 6 7
iX=1 iY=4 Cnt=9
        0 1 2 3 4 5 6 7 8
iX=2 iY=0 Cnt=10
        0 1 2 3 4 5 6 7 8 9
iX=2 iY=1 Cnt=11
        0 1 2 3 4 5 6 7 8 9 10
iX=2 iY=2 Cnt=12
        0 1 2 3 4 5 6 7 8 9 10 11
iX=2 iY=3 Cnt=13
        0 1 2 3 4 5 6 7 8 9 10 11 12
iX=2 iY=4 Cnt=14
        0 1 2 3 4 5 6 7 8 9 10 11 12 13
iX=3 iY=0 Cnt=15
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
iX=3 iY=1 Cnt=16
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
iX=3 iY=2 Cnt=17
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
iX=3 iY=3 Cnt=18
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
iX=3 iY=4 Cnt=19
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
iX=4 iY=0 Cnt=20
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
iX=4 iY=1 Cnt=21
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
iX=4 iY=2 Cnt=22
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
iX=4 iY=3 Cnt=23
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
iX=4 iY=4 Cnt=24
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23