I have the following simple piece of code, consisting of 4 files:
//Data.h:
#ifndef DATA_H
#define DATA_H
/// Number of elements in each of the global arrays declared below.
constexpr int N = 10000000;

/// Global, statically sized buffers shared by the whole program.
/// NOTE(review): these are non-inline definitions in a header, so the header
/// can be included from only one translation unit without causing
/// multiple-definition link errors.
namespace data {

/// Input/working buffer of N floats.
float particles[N];

/// Output buffer of N floats.
float outputArray[N];

}  // namespace data
#endif
//Process.h:
#ifndef PROCESS_H
#define PROCESS_H
/// Driver that applies the per-index operations of a policy object to every
/// index in [0, N) from inside an OpenACC `parallel loop`.
///
/// The policy type `ProcessImpl` is expected to provide
///   float Get1(int ind, float* outputArray) const;
///   void  Get2(int ind) const;
/// which is the interface the member functions below call.
template <typename ProcessImpl>
class Process{
public:
    /// The policy type this driver was instantiated with.
    using Base_t = ProcessImpl;

    /// Stores a copy of the policy object `arg`.
    Process(ProcessImpl arg):fProcessImpl(arg){}

    /// For each ind in [0, N): outputArray[ind] = policy.Get1(ind, outputArray).
    void Get1(int N, float * outputArray) const;

    /// For each ind in [0, N): calls policy.Get2(ind).
    void Get2(int N) const;

private:
    ProcessImpl fProcessImpl;  ///< Policy object the loops delegate to.
};

/// Runs the policy's Get1 for every index and stores its return value.
///
/// @param N            Number of indices to process (shadows any global `N`).
/// @param outputArray  Destination buffer; the `present` clause below states
///                     that it must already be present on the device.
template <class ProcessImpl>
void Process<ProcessImpl>::Get1(int N, float * outputArray) const
{
#pragma acc parallel loop gang vector present(outputArray)
    for (int ind = 0; ind < N; ++ind) {
        // Fixed: the original wrote to the undeclared name `outputCSArray`
        // and called Get1 with one argument, while the policy's Get1 takes
        // (index, output buffer).
        outputArray[ind] = fProcessImpl.Get1(ind, outputArray);
    }
}

/// Runs the policy's Get2 for every index in [0, N).
///
/// @param N  Number of indices to process (shadows any global `N`).
template <class ProcessImpl>
void Process<ProcessImpl>::Get2(int N) const
{
#pragma acc parallel loop gang vector
    for (int ind = 0; ind < N; ++ind) {
        fProcessImpl.Get2(ind);
    }
}
#endif
//ProcessImpl.h:
#ifndef PROCESSIMPL_H
#define PROCESSIMPL_H
#include "Data.h"
using namespace data;
/// Per-index operations on the global `particles` array.
/// Both members are defined inside the class body, which makes them
/// implicitly inline.
class ProcessImpl
{
public:
    /// Copies particles[ind] into outputArray[ind] and returns
    /// particles[ind + 1].
    ///
    /// @param ind          Index of the element to copy.
    /// @param outputArray  Destination buffer written at position `ind`.
    /// @return             The element that follows `ind` in `particles`.
    ///
    /// NOTE(review): this reads element ind + 1, so calling it with the last
    /// valid index of `particles` reads one past the end — confirm the
    /// intended index range with the caller.
    float Get1(int ind, float * outputArray) const
    {
        outputArray[ind] = particles[ind];
        return particles[ind + 1];
    }

    /// Doubles particles[ind] in place.
    ///
    /// @param ind  Index of the element to scale.
    void Get2(int ind) const
    {
        particles[ind] *= 2;
    }
};
#endif
//main.cpp:
#include <iostream>
#include "Data.h"
#include "Process.h"
#include "ProcessImpl.h"
#include <accelmath.h>
#include <openacc.h>
using namespace data;
/// Concrete driver type used by this program.
using Process_t = Process<ProcessImpl>;

/// Global driver instance, built from a value-initialized policy object.
Process_t process{Process_t::Base_t{}};

/// Host-side helper: prints the first 100 elements of `particles`, separated
/// by single spaces, followed by a newline and a stream flush.
static void PrintLeadingParticles()
{
    constexpr int kPreviewCount = 100;
    for (int idx = 0; idx < kPreviewCount; ++idx) {
        std::cout << particles[idx] << " ";
    }
    std::cout << std::endl;
}

/// Fills `particles` with 0, 1, 2, ... inside an OpenACC parallel loop,
/// prints the first 100 values, doubles every element through
/// `process.Get2`, and prints the first 100 values again.
int main(int argc, char **argv)
{
    // Data region covering both global arrays for the enclosed block.
#pragma acc data create(particles,outputArray)
    {
        // Initialize particles[i] = i inside the parallel loop.
#pragma acc parallel loop gang vector present(particles)
        for (int idx = 0; idx < N; ++idx) {
            particles[idx] = static_cast<float>(idx);
        }

        // Refresh the host copy before printing it.
#pragma acc update host(particles)
        PrintLeadingParticles();

        // Double every element via the policy's Get2.
        process.Get2(N);

        // Refresh the host copy again to observe the doubled values.
#pragma acc update host(particles)
        PrintLeadingParticles();
    }
    return 0;
}
It works properly on the CPU with the PGI 19.4 compiler, but my task is to run the code on a GPU. I use PGI 19.4 + OpenACC with a simple CMakeLists.txt file and the following configure line (GPU: Nvidia GeForce 650 Ti, compute capability 3.0):
cmake . -DCMAKE_C_COMPILER=pgcc -DCMAKE_CXX_COMPILER=pgc++
-DCMAKE_C_FLAGS="-acc -Minfo=acc -mcmodel=medium -ta=tesla:cc30"
-DCMAKE_CXX_FLAGS="-acc -Minfo=acc -mcmodel=medium -ta=tesla:cc30"
The compilation fails with:
> Scanning dependencies of target Test
[ 50%] Building CXX object CMakeFiles/Test.dir/main.cpp.o
main:
16, Generating create(_ZN4data11outputArrayE[:])
Generating present(_ZN4data9particlesE[:])
Generating create(_ZN4data9particlesE[:])
Generating Tesla code
18, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
18, Generating update self(_ZN4data9particlesE[:])
23, Generating update self(_ZN4data9particlesE[:])
PGCC-W-0155-External and Static variables are not supported in acc routine -
_ZN4data9particlesE (/home/70-gaa/source/13OpenAccTest/main.cpp: 19)
ProcessImpl::Get2(int) const:
4, include "ProcessImpl.h"
18, Generating implicit acc routine seq
Process<ProcessImpl>::Get2(int) const:
3, include "Process.h"
25, Generating Tesla code
27, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
25, Generating implicit copyin(this[:])
PGCC/x86-64 Linux 19.4-0: compilation completed with warnings
[100%] Linking CXX executable Test
nvlink error : Undefined reference to '_ZNK11ProcessImpl4Get2Ei' in
'CMakeFiles/Test.dir/main.cpp.o'
pgacclnk: child process exit status 2: /opt/pgi/linux86-64-llvm/19.4/bin/pgnvd
CMakeFiles/Test.dir/build.make:83: recipe for target 'Test' failed
make[2]: *** [Test] Error 2
CMakeFiles/Makefile2:72: recipe for target 'CMakeFiles/Test.dir/all' failed
make[1]: *** [CMakeFiles/Test.dir/all] Error 2
Makefile:83: recipe for target 'all' failed
make: *** [all] Error 2
Using pggdecode, I found that "_ZNK11ProcessImpl4Get2Ei" is the mangled name for ProcessImpl::Get2(int) const. I removed the inline keyword from ProcessImpl.h and tried adding copyin(process) to #pragma acc data create(particles,outputArray) in main(), but this did not help. I work on Fedora 23 with gcc 5.3.1.
In the full code I avoid the problem of multiple definitions of the arrays particles and outputArray by including them in a single .cpp file, because OpenACC does not allow the extern keyword to be used. Maybe this is not good practice (if you know a better way, please advise), but it works.
The questions are:
How do I properly pass the GPU versions of the arrays particles and outputArray to Get1() and Get2() in Process.h, and make Get1() and Get2() in ProcessImpl.h work with the arrays allocated on the GPU? And how do I compile this code?
How does OpenACC allow code inside compute regions to directly access globally allocated arrays that were copied to the GPU, without passing a pointer to them as a parameter of the calling function?
Thank You.
The undefined reference occurs because the device version of Get2 was not created, as reported by this compiler message:
PGCC-W-0155-External and Static variables are not supported in acc routine _ZN4data9particlesE (/home/70-gaa/source/13OpenAccTest/main.cpp: 19)
The problem is that global variables accessed directly within device routines need a device version defined at link time so that the linker can make the association between the two. One option would be to pass "particles" in as an argument, but the easier option is to put "particles" in a "declare create" directive.
The "declare" directive creates a data region that has the same scope as the scoping unit in which it is defined. So using it for variables with global scope also puts them in global scope on the device.
% cat Data.h
//Data.h:
#ifndef DATA_H
#define DATA_H
/// Number of elements in each of the global arrays declared below.
constexpr int N = 10000000;

/// Global, statically sized buffers shared by the whole program.
namespace data {

/// Input/working buffer of N floats.
float particles[N];

/// Output buffer of N floats.
float outputArray[N];

// Give `particles` a device counterpart with the same (global) scope, so
// that device routines can reference the array directly by name.
#pragma acc declare create(particles[:N])

}  // namespace data
#endif
% pgc++ -I. main.cpp -ta=tesla -Minfo=accel
main:
17, Generating create(_ZN4data11outputArrayE[:]) [if not already present]
Generating Tesla code
19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
19, Generating update self(_ZN4data9particlesE[:])
24, Generating update self(_ZN4data9particlesE[:])
ProcessImpl::Get2(int) const:
5, include "ProcessImpl.h"
19, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
Process<ProcessImpl>::Get2(int) const:
4, include "Process.h"
23, Generating Tesla code
25, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
23, Generating implicit copyin(this[:]) [if not already present]
% a.out
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198