I am trying to wrap my head around combining openacc with pointers to structs containing dynamically allocated members. The code below fails with
Failing in Thread:1 call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
when compiled using nvc ("nvc 20.9-0 LLVM 64-bit target on x86-64 Linux -tp haswell"). As far as I can tell I am following the approach suggested eg in the OpenACC 'getting started' guide. But somehow presumably the pointers don't stick (?) on the device. Does anyone know what goes wrong here?
#include <stdlib.h>
#include <stdio.h>
typedef struct grid
{
int N;
double *X;
} grid;
void allocate(grid* g, int N)
{
g->N = N;
g->X = (double*) malloc(sizeof(double) * g->N);
#pragma acc enter data create(g[0:1])
#pragma acc enter data create(g->X[0:N])
}
void release(grid* g)
{
#pragma acc exit data delete(g->X[0:g->N])
#pragma acc exit data delete(g[0:1])
free(g->X);
}
void fill(grid * g)
{
int i;
#pragma acc parallel loop
for (i = 0; i < g->N; i++)
{
g->X[i] = 42; // the cuprit, commenting this removes the error too
}
}
int main()
{
grid g;
allocate(&g, 10);
fill(&g);
release(&g);
return 0;
}```
From the compiler feedback messages you'll see something like:
fill:
32, Accelerator restriction: size of the GPU copy of g is unknown
Generating Tesla code
32, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
32, Generating implicit copyin(g) [if not already present]
37, Generating update self(g->X[:g->N])
The problem being that the compiler can't implicitly copy aggregate types with dynamic data members so you need to add a "present(g)" to indicate that g is already the device.
Also, you'll want to copyin g in order to get the value of N on the device and no need to include the array shape in the exit data delete directive. For example:
% cat test.c
#include <stdlib.h>
#include <stdio.h>
typedef struct grid
{
int N;
double *X;
} grid;
void allocate(grid* g, int N)
{
g->N = N;
g->X = (double*) malloc(sizeof(double) * g->N);
#pragma acc enter data copyin(g[0:1])
#pragma acc enter data create(g->X[0:N])
}
void release(grid* g)
{
#pragma acc exit data delete(g->X)
#pragma acc exit data delete(g)
free(g->X);
}
void fill(grid * g)
{
int i;
#pragma acc parallel loop present(g)
for (i = 0; i < g->N; i++)
{
g->X[i] = 42; // the cuprit, commenting this removes the error too
}
#pragma acc update self(g->X[:g->N])
for (i = 0; i < 4; i++)
{
printf("%d : %f \n",i,g->X[i]);
}
}
int main()
{
grid g;
allocate(&g, 10);
fill(&g);
release(&g);
return 0;
}
% nvc -acc test.c -Minfo=accel -V20.9 ; a.out
allocate:
17, Generating enter data copyin(g[:1])
Generating enter data create(g->X[:N])
release:
24, Generating exit data delete(g[:1],g->X[:1])
fill:
32, Generating present(g[:1])
Generating Tesla code
32, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
37, Generating update self(g->X[:g->N])
0 : 42.000000
1 : 42.000000
2 : 42.000000
3 : 42.000000