c++ g++ cublas openacc pgi

Memory error when linking g++ with pgi compiled code using OpenACC and cublasDgemv


In order to use my GPU with OpenACC and cuBLAS in an application that is compiled with g++, I set up a small test example. For that I created the files main.cpp, pgiCudaCode.h and pgiCudaCode.cpp (listed at the end of this question).

My test system is Ubuntu 18.04 Linux with g++ 7.5.0, pgc++ 19.10-0 and an NVIDIA GTX 1070 card.

The file pgiCudaCode.cpp contains implementations of a general matrix-vector multiply using OpenACC and cuBLAS. It is compiled with the PGI compiler using the command:

pgc++ -fast -Minfo=opt -ta:tesla:cc60,managed,nordc -Mcudalib=cublas -Minfo=accel -fPIC pgiCudaCode.cpp -c pgiCudaCode.o

I use the nordc option so that the resulting object file can be linked with g++.

The main file is compiled and linked with g++:

g++ -fPIC pgiCudaCode.o -L/opt/pgi/linux86-64/19.10/lib/ -laccapi -laccg -laccn -laccg2 -lpgiman -ldl -lcudadevice -lcudapgi -lomp -lnuma -lpthread -lnspgc -lpgc -lm -lgcc -lc -lgcc -lpgmath -lblas -lpgatm -lpgkomp -L/opt/pgi/linux86-64/2019/cuda/10.1/lib64/ -lcublas -lcublasLt -lcudart main.cpp -o mainGCC

After setting up these exports on my Ubuntu 18.04 system

export LD_LIBRARY_PATH="/opt/pgi/linux86-64/19.10/lib/:$LD_LIBRARY_PATH"
export LD_LIBRARY_PATH="/opt/pgi/linux86-64/2019/cuda/10.1/lib64/:$LD_LIBRARY_PATH"

I can run the executable mainGCC and get the following output:

./mainGCC 
Vector 1:
1
1

Matrix:
        1       3
        2       4

matrix*vec pure openACC:
4
6

matrix*vec cublas with internal allocation:
4
6

matrix*vec cublas without internal allocation:
Failing in Thread:1
call to cuMemcpyDtoHAsync returned error 700: Illegal address during kernel execution

Failing in Thread:1
call to cuMemFreeHost returned error 700: Illegal address during kernel execution

I do not get this error when compiling and linking main.cpp with the PGI compiler:

pgc++ -fast -Minfo=opt -ta:tesla:cc60,managed,nordc -Mcudalib=cublas -Minfo=accel -fPIC pgiCudaCode.o main.cpp -o mainPGI

Here the output of mainPGI is correct:

Vector 1:
1
1

Matrix:
        1       3
        2       4

matrix*vec pure openACC:
4
6

matrix*vec cublas with internal allocation:
4
6

matrix*vec cublas without internal allocation:
4
6

So the interesting part is that only matmul fails, i.e. the variant that hands the g++-allocated host pointers directly to the OpenACC data region and cuBLAS, while matmul_internAlloc, which first copies the data into buffers allocated inside the PGI-compiled file, works fine.

This brings me to my question:

How can I prevent this error when using g++-allocated memory in the function matmul?

Here are the corresponding .cpp and .h files.

main.cpp:

#include <iostream>
#include "pgiCudaCode.h"

void printVec(int N, double* vec)
{
    for(int i = 0; i < N; i++)
    {
        std::cout << vec[i] << std::endl;
    }
}

void printMatrix(int N, double* matr)
{
    for(int i = 0; i < N; i++)
    {
        for(int j = 0; j < N; j++)
        {
            std::cout << '\t' << matr[i + j * N];
        }
        std::cout << std::endl;
    }
}

int main()
{
    int N        = 2;
    double* vec1 = new double[N];
    vec1[0]      = 1.0;
    vec1[1]      = 1.0;
    double* vec2 = new double[N];
    vec2[0]      = 0.0;
    vec2[1]      = 0.0;
    double* matr = new double[N*N];
    matr[0]      = 1.0;
    matr[1]      = 2.0;
    matr[2]      = 3.0;
    matr[3]      = 4.0;

    std::cout << "Vector 1:" << std::endl;
    printVec(N, vec1);
    std::cout << std::endl;

    std::cout << "Matrix:" << std::endl;
    printMatrix(N, matr);
    std::cout << std::endl;

    std::cout << "matrix*vec pure openACC:" << std::endl;
    matmulPureOpenACC(N, matr, vec1, vec2);
    printVec(N, vec2);
    std::cout << std::endl;

    vec2[0]      = 0.0;
    vec2[1]      = 0.0;

    std::cout << "matrix*vec cublas with internal allocation:" << std::endl;
    matmul_internAlloc(N, matr, vec1, vec2);
    printVec(N, vec2);
    std::cout << std::endl;

    vec2[0]      = 0.0;
    vec2[1]      = 0.0;

    std::cout << "matrix*vec cublas without internal allocation:" << std::endl;
    matmul(N, matr, vec1, vec2);
    printVec(N, vec2);
    std::cout << std::endl;

    delete [] vec1;
    delete [] vec2;
    delete [] matr;
    return 0;
}

pgiCudaCode.h:

#ifndef PGICUDACODE_H
#define PGICUDACODE_H


void matmul(int n, const double* matr, const double* b, double* c);

void matmul_internAlloc(int n, const double* matr, const double* b, double* c);

void matmulPureOpenACC(int n, const double* matr, const double* b, double* c);

#endif

pgiCudaCode.cpp:

#include <iostream>
#include <algorithm>   // std::copy
#include <cublas_v2.h>
#include "pgiCudaCode.h"

void matmul(int n, const double* matr, const double* b, double* c)
{
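    // Map the caller's host buffers (allocated in g++-compiled main.cpp) to the
    // device, then hand the corresponding device addresses to cuBLAS via
    // host_data use_device.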
    #pragma acc data pcopyin(n , matr[0:n*n], b[0:n]) pcopy(c[0:n])
    {
        #pragma acc host_data use_device(matr, b, c)
        {
            cublasHandle_t handle;
            cublasStatus_t stat = cublasCreate(&handle);
            if ( CUBLAS_STATUS_SUCCESS != stat ) {
                std::cerr<<"CUBLAS initialization failed"<<std::endl;
            }

            if ( CUBLAS_STATUS_SUCCESS == stat )
            {
                const double alpha = 1.0;
                const double beta  = 1.0;
                stat = cublasDgemv_v2(handle, CUBLAS_OP_N, n,n, &alpha, matr, n, b, 1, &beta, c, 1);
                if (stat != CUBLAS_STATUS_SUCCESS) {
                    std::cerr<<"cublasDgemv failed"<<std::endl;
                }
            }
            cublasDestroy(handle);
        }
    }
}

void matmul_internAlloc(int n2, const double* matr2, const double* b2, double* c2)
{
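    // Work on local copies: allocate buffers inside this PGI-compiled file and
    // copy the caller's data into them before entering the OpenACC data region.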
    int n         = n2;
    double* matr  = new double[n*n];
    double* b     = new double[n];
    double* c     = new double[n];

    std::copy(&matr2[0], &matr2[n*n], &matr[0]);
    std::copy(&b2[0], &b2[n], &b[0]);
    std::copy(&c2[0], &c2[n], &c[0]);

    #pragma acc data pcopyin(n , matr[0:n*n], b[0:n]) pcopy(c[0:n])
    {
        #pragma acc host_data use_device(matr, b, c)
        {
            cublasHandle_t handle;
            cublasStatus_t stat = cublasCreate(&handle);
            if ( CUBLAS_STATUS_SUCCESS != stat ) {
                std::cerr<<"CUBLAS initialization failed"<<std::endl;
            }

            if ( CUBLAS_STATUS_SUCCESS == stat )
            {
                const double alpha = 1.0;
                const double beta  = 1.0;
                stat = cublasDgemv_v2(handle, CUBLAS_OP_N, n,n, &alpha, matr, n, b, 1, &beta, c, 1);
                if (stat != CUBLAS_STATUS_SUCCESS) {
                    std::cerr<<"cublasDgemv failed"<<std::endl;
                }
            }
            cublasDestroy(handle);
        }
    }
    std::copy(&c[0], &c[n], &c2[0]);
    delete [] matr;
    delete [] b;
    delete [] c;
}

void matmulPureOpenACC(int n, const double* matr, const double* b, double* c)
{
    #pragma acc data pcopyin(n, matr[0:n*n], b[0:n]) pcopy(c[0:n])
    {
        #pragma acc parallel loop
        for(int i = 0; i < n; i++)
        {
            #pragma acc loop seq
            for(int j = 0; j < n; j++)
            {
                c[i] += matr[i + j*n]*b[j];
            }
        }
    }
}
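
For comparison, a variant of matmul that does not rely on the OpenACC data directives at all, but stages the same g++-allocated host buffers through explicit CUDA/cuBLAS copies, would look roughly like the sketch below. It is not part of the files above; matmul_explicitCopy is a hypothetical name, and the point is only that the cuBLAS call itself does not care which compiler allocated the host memory, as long as it receives valid device pointers.

#include <cuda_runtime.h>

void matmul_explicitCopy(int n, const double* matr, const double* b, double* c)
{
    // Allocate plain device buffers instead of relying on OpenACC mappings.
    double* dMatr = nullptr;
    double* dB    = nullptr;
    double* dC    = nullptr;
    cudaMalloc((void**)&dMatr, sizeof(double) * n * n);
    cudaMalloc((void**)&dB,    sizeof(double) * n);
    cudaMalloc((void**)&dC,    sizeof(double) * n);

    cublasHandle_t handle;
    cublasStatus_t stat = cublasCreate(&handle);
    if ( CUBLAS_STATUS_SUCCESS == stat )
    {
        // Copy the host data (allocated by g++-compiled code) to the device.
        cublasSetMatrix(n, n, sizeof(double), matr, n, dMatr, n);
        cublasSetVector(n, sizeof(double), b, 1, dB, 1);
        cublasSetVector(n, sizeof(double), c, 1, dC, 1);

        const double alpha = 1.0;
        const double beta  = 1.0;
        stat = cublasDgemv_v2(handle, CUBLAS_OP_N, n, n, &alpha, dMatr, n, dB, 1, &beta, dC, 1);
        if (stat != CUBLAS_STATUS_SUCCESS) {
            std::cerr<<"cublasDgemv failed"<<std::endl;
        }

        // Copy the result back into the caller's host buffer.
        cublasGetVector(n, sizeof(double), dC, 1, c, 1);
        cublasDestroy(handle);
    }
    else {
        std::cerr<<"CUBLAS initialization failed"<<std::endl;
    }

    cudaFree(dMatr);
    cudaFree(dB);
    cudaFree(dC);
}

This variant would live in pgiCudaCode.cpp next to the other implementations and build with the same commands; the link lines above already pull in -lcudart and -lcublas.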

Solution

  • You're best off using pgc++ to link. Compiling main.cpp with g++ is fine, but pgc++ at link time implicitly pulls in initialization routines that are needed for OpenACC and CUDA interoperability; without that initialization you'll see runtime errors like this one. (If the final link has to stay with g++, see the hedged sketch after the transcript below.)

    % pgc++ -fast -ta:tesla:cc70 pgiCudaCode.cpp -c pgiCudaCode.o
    pgiCudaCode.cpp:
    % g++ -c main.cpp
    % pgc++ -fast -ta:tesla:cc70 -Mcudalib=cublas -Mcuda pgiCudaCode.o main.o -o mainGCC 
    % ./mainGCC
    Vector 1:
    1
    1
    
    Matrix:
            1       3
            2       4
    
    matrix*vec pure openACC:
    4
    6
    
    matrix*vec cublas with internal allocation:
    4
    6
    
    matrix*vec cublas without internal allocation:
    4
    6
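
  • If the final link has to stay with g++, one thing that is sometimes tried is to initialize the OpenACC runtime explicitly from the application before any accelerator work, using acc_init() from openacc.h (acc_init and acc_device_nvidia are standard OpenACC/PGI runtime API entries). Treat the following as a hedged sketch only: whether an explicit acc_init() call replaces everything the pgc++ link step sets up, in particular the managed-memory support implied by -ta:tesla:managed, is an assumption, so linking with pgc++ as shown above remains the reliable fix. When main.cpp is compiled with g++, the PGI include directory has to be on the include path so that openacc.h is found; the OpenACC runtime libraries are already on the g++ link line in the question.

    #include <openacc.h>
    #include <iostream>
    #include "pgiCudaCode.h"

    int main()
    {
        // Initialize the OpenACC runtime for the NVIDIA device up front.
        // NOTE: this is a possible workaround only; it may not cover all of
        // the setup that linking with pgc++ performs.
        acc_init(acc_device_nvidia);

        // ... the rest of main() unchanged from the original file ...
        return 0;
    }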