I am trying to modify an existing particle method code using OpenACC to run on GPU. The existing code utilizes a 2D dynamic array of struct in c. I need to copy the structure(s) to GPU for further calculation. A code sample is given below:
typedef struct{
int *list; // it is list of particles in a given bucket
int count; // it is the total number of particles in the bucket
} structBucket;
typedef struct{
structBucket **bucket;
int numberOfBuckets[2]; // number of buckets in x- and y- dimensions
} structDomain;
structDomain domain;
// Allocate memory for **bucket
domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;
int iX,iY, capacity;
domain.bucket = (structBucket**)malloc( sizeof(structBucket*) * domain.numberOfBuckets[XDIM] );
for (iX=0 ; iX < domain.numberOfBuckets[XDIM] ; iX++)
domain.bucket[iX] = (structBucket*)malloc( sizeof(structBucket) * domain.numberOfBuckets[YDIM]);
// Calculate domain.bucket[iX][iY].count here using some logic
.
.
.
// Allocate memory for *list
for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
{
for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
{
capacity = domain.bucket[iX][iY].count;
if (capacity > 0)
{
domain.bucket[iX][iY].list = (int *)malloc(sizeof(int) * capacity);
}
}
}
After reviewing various sources on the internet, I have come up with the following solution (which might be utterly wrong)"
// It is needed to create the memory for **bucket and *list on GPU.
#pragma acc enter data copyin(domain)
#pragma acc enter data copyin(domain.bucket)
#pragma acc enter data create(domain.bucket[0:domain.numberOfBuckets[XDIM]][0:domain.numberOfBuckets[YDIM]])
for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
{
for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
{
#pragma acc enter data create(domain.bucket[iX][iY].list[0:domain.bucket[iX][iY].count])
}
}
It is requested for an advice manual deep copy of **bucket and *list to GPU memory. Is my solution accurate? Could someone suggest improvements or a better solution for manual deep copy of said struct(s).
I am using PGI 19.4 compiler on Windows 10. Many thanks
Close. The only thing I'd do different is to not create "domain.bucket" and update the bucket's count so the device has this information. Also, since updates/copies are shallow, be sure to only update the list array or scalars in the structs. Otherwise you may overwrite device/host pointers. Here's an example. While I'm using Linux, other than the executable name, the code should the same.
% cat test.c
#include <stdio.h>
#include <stdlib.h>
typedef struct{
int *list; // it is list of particles in a given bucket
int count; // it is the total number of particles in the bucket
} structBucket;
typedef struct{
structBucket **bucket;
int numberOfBuckets[2]; // number of buckets in x- and y- dimensions
} structDomain;
#define XDIM 64
#define YDIM 64
int main() {
structDomain domain;
int iX,iY, capacity;
// Allocate memory for **bucket
domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;
domain.bucket = (structBucket**)malloc( sizeof(structBucket*) * domain.numberOfBuckets[XDIM] );
for (iX=0 ; iX < domain.numberOfBuckets[XDIM] ; iX++)
domain.bucket[iX] = (structBucket*)malloc( sizeof(structBucket) * domain.numberOfBuckets[YDIM]);
// Calculate domain.bucket[iX][iY].count here using some logic
for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
{
for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
{
domain.bucket[iX][iY].count = iX*domain.numberOfBuckets[YDIM]+iY;
}}
#pragma acc enter data copyin(domain)
#pragma acc enter data create(domain.bucket[:domain.numberOfBuckets[XDIM]][:domain.numberOfBuckets[YDIM]])
// Allocate memory for *list
for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
{
for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
{
capacity = domain.bucket[iX][iY].count;
#pragma acc update device(domain.bucket[iX][iY].count)
if (capacity > 0)
{
domain.bucket[iX][iY].list = (int *)malloc(sizeof(int) * capacity);
#pragma acc enter data create(domain.bucket[iX][iY].list[:capacity])
}
}
}
#pragma acc parallel loop gang collapse(2) present(domain)
for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
{
for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
{
capacity = domain.bucket[iX][iY].count;
if (capacity > 0) {
#pragma acc loop vector
for (int i = 0; i < capacity; ++i) {
domain.bucket[iX][iY].list[i] = i;
}
}
}}
for (iX = 0; iX < 5; iX++)
{
for (iY = 0; iY < 5; iY++)
{
capacity = domain.bucket[iX][iY].count;
if (capacity > 0) {
#pragma acc update host(domain.bucket[iX][iY].list[:capacity])
printf("iX=%d iY=%d Cnt=%d\n\t",iX,iY,capacity);
for (int i = 0; i < capacity; ++i) {
printf("%d ",domain.bucket[iX][iY].list[i]);
}
printf("\n");
}
}}
exit(0);
}
% pgcc test.c -ta=tesla -Minfo=accel -V19.4
main:
40, Generating enter data copyin(domain)
41, Generating enter data create(domain.bucket[:domain.numberOfBuckets][:domain.numberOfBuckets])
49, Generating update device(domain.bucket->->count)
52, Generating enter data create(domain.bucket->->list[:capacity])
57, Generating present(domain)
Generating Tesla code
58, #pragma acc loop gang collapse(2) /* blockIdx.x */
60, /* blockIdx.x collapsed */
65, #pragma acc loop vector(128) /* threadIdx.x */
65, Accelerator restriction: size of the GPU copy of domain.bucket is unknown
Loop is parallelizable
78, Generating update self(domain.bucket->->list[:capacity])
% a.out
iX=0 iY=1 Cnt=1
0
iX=0 iY=2 Cnt=2
0 1
iX=0 iY=3 Cnt=3
0 1 2
iX=0 iY=4 Cnt=4
0 1 2 3
iX=1 iY=0 Cnt=5
0 1 2 3 4
iX=1 iY=1 Cnt=6
0 1 2 3 4 5
iX=1 iY=2 Cnt=7
0 1 2 3 4 5 6
iX=1 iY=3 Cnt=8
0 1 2 3 4 5 6 7
iX=1 iY=4 Cnt=9
0 1 2 3 4 5 6 7 8
iX=2 iY=0 Cnt=10
0 1 2 3 4 5 6 7 8 9
iX=2 iY=1 Cnt=11
0 1 2 3 4 5 6 7 8 9 10
iX=2 iY=2 Cnt=12
0 1 2 3 4 5 6 7 8 9 10 11
iX=2 iY=3 Cnt=13
0 1 2 3 4 5 6 7 8 9 10 11 12
iX=2 iY=4 Cnt=14
0 1 2 3 4 5 6 7 8 9 10 11 12 13
iX=3 iY=0 Cnt=15
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
iX=3 iY=1 Cnt=16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
iX=3 iY=2 Cnt=17
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
iX=3 iY=3 Cnt=18
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
iX=3 iY=4 Cnt=19
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
iX=4 iY=0 Cnt=20
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
iX=4 iY=1 Cnt=21
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
iX=4 iY=2 Cnt=22
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
iX=4 iY=3 Cnt=23
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
iX=4 iY=4 Cnt=24
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23