c++cudalinear-algebracusolver

cuSolver sample with cuSolverDnDgetrf not working


OK. I'm getting my hands dirty with some code I took from cuSolver samples. I'm little experienced with C++, but I somehow managed to take the stuff I need from the original code.

The problem is when I try to execute it; as recommended from the reference manual I compile with:

nvcc -c att3_cus_lu.cpp -I/usr/local/cuda-8.0/targets/x86_64-linux/include
g++ -fopenmp -o res.out att3_cus_lu.o -L/usr/local/cuda/lib64 -lcudart -lcublas -lcusolver -lcusparse

No problems till here; the output I get is always the same though:

step 1: set matrix (A) and right hand side vector (b)
step 2: prepare data on device
step 3: solve A*x = b
Error: LU factorization failed
INFO_VALUE = 2
timing: LU =   0.000208 sec
step 4: evaluate residual
|b - A*x| = NAN
|A| = 1.000000E+00
|x| = NAN
|b - A*x|/(|A|*|x|) = NAN

Whatever matrix or b vector I put the result is the same; the code can't factorize the matrix. I've shown the INFO_VALUE whose value is always 2 on every execution; it's the value of the info variable requested for the cuSolverDnDgetrf() function. In the cuSolver reference manual it says:

This function computes the LU factorization of a m×n matrix PA = LU where A is a m×n matrix, P is a permutation matrix, L is a lower triangular matrix with unit diagonal, and U is an upper triangular matrix. If LU factorization failed, i.e. matrix A(U) is singular, The output parameter devInfo=i indicates U(i,i) = 0.

In the code below I've put the identical matrix though, so no singular matrices are running around.

Here is the whole code; the pattern for main() cuda codes is repetitive: define host variables, cudaMemcpy them to the device, execute them on the device with cuda functions, cudaMemcpy them back to host, then go on with serial code until you repeat.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

#include <cuda_runtime.h>

#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cuda.h"

#include "helper_cusolver.h"

#ifndef MAX
    #define MAX(a,b) (a > b) ? a : b
#endif

 void linearSolverLU(
    cusolverDnHandle_t handle,
    int n,
    const double *Acopy,
    int lda,
    const double *b,
    double *x)
{
    int bufferSize = 0;
    int *info = NULL;
    double *buffer = NULL;
    double *A = NULL;
    int *ipiv = NULL; // pivoting sequence
    int h_info = 0;
    double start, stop;
    double time_solve;

    checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n,(double*)Acopy, lda, &bufferSize));

checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int)*n));


// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));

start = second();

checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));

if ( 0 != h_info ){
    fprintf(stderr, "Error: LU factorization failed\n");
printf("INFO_VALUE = %d\n",h_info);
}

checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();

time_solve = stop - start;
fprintf (stdout, "timing: LU = %10.6f sec\n", time_solve);

if (info  ) { checkCudaErrors(cudaFree(info  )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A     ) { checkCudaErrors(cudaFree(A)); }
if (ipiv  ) { checkCudaErrors(cudaFree(ipiv));}

}

//int main (int argc, char *argv[]) !!!
int main(void)
{
    cusolverDnHandle_t handle = NULL;
    cublasHandle_t cublasHandle = NULL; // used in residual evaluation
    cudaStream_t stream = NULL;

    int rowsA = 3; // number of rows of A
    int colsA = 3; // number of columns of A
    int lda = MAX(colsA, rowsA); // leading dimension in dense matrix

    double *h_A = NULL; // dense matrix
    double *h_x = NULL; // a copy of d_x
    double *h_b = NULL; // b = ones(m,1)
    double *h_r = NULL; // r = b - A*x, a copy of d_r

    double *d_A = NULL; // a copy of h_A
    double *d_x = NULL; // x = A \ b
    double *d_b = NULL; // a copy of h_b
    double *d_r = NULL; // r = b - A*x

    // the constants are used in residual evaluation, r = b - A*x
    const double minus_one = -1.0;
    const double one = 1.0;

    double x_inf = 0.0;
    double r_inf = 0.0;
    double A_inf = 0.0;
    int errors = 0;

    int i, j, col, row, k;

    h_A = (double*)malloc(sizeof(double)*lda*colsA);
    h_x = (double*)malloc(sizeof(double)*colsA);
    h_b = (double*)malloc(sizeof(double)*rowsA);
    h_r = (double*)malloc(sizeof(double)*rowsA);
    assert(NULL != h_A);
    assert(NULL != h_x);
    assert(NULL != h_b);
    assert(NULL != h_r);

    memset(h_A, 0., sizeof(double)*lda*colsA);

    printf("step 1: set matrix (A) and right hand side vector (b)\n");

    double mat[9] = {1.,0.,0.,0.,1.,0.,0.,0.,1.};
    double bb[3] = {1., 1., 1.}; //RANDOM MATRICES 4 TESTING

    for( row =0; row < rowsA ; row++ )
    {

        for( col = 0; col < colsA ; col++ )
        {
            h_A[row*lda + col] = mat[row];
        }
    }

    for( k = 0; k < rowsA; k++ ){
    h_b[k] = bb[k];
    }

    checkCudaErrors(cusolverDnCreate(&handle));
    checkCudaErrors(cublasCreate(&cublasHandle));
    checkCudaErrors(cudaStreamCreate(&stream));

    checkCudaErrors(cusolverDnSetStream(handle, stream));
    checkCudaErrors(cublasSetStream(cublasHandle, stream));


    checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double)*lda*colsA));
    checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*colsA));
    checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*rowsA));
    checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rowsA));

    printf("step 2: prepare data on device\n");
    checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double)*lda*colsA, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));

    printf("step 3: solve A*x = b \n");

    linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);

    printf("step 4: evaluate residual\n");
    checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*rowsA, cudaMemcpyDeviceToDevice));

    // r = b - A*x
    checkCudaErrors(cublasDgemm_v2(
        cublasHandle,
        CUBLAS_OP_N,
        CUBLAS_OP_N,
        rowsA,
        1,
        colsA,
        &minus_one,
        d_A,
        lda,
        d_x,
        rowsA,
        &one,
        d_r,
        rowsA));

    checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*colsA, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));

    x_inf = vec_norminf(colsA, h_x);
    r_inf = vec_norminf(rowsA, h_r);
    A_inf = mat_norminf(rowsA, colsA, h_A, lda);

    printf("|b - A*x| = %E \n", r_inf);
    printf("|A| = %E \n", A_inf);
    printf("|x| = %E \n", x_inf);
    printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));

    if (handle) { checkCudaErrors(cusolverDnDestroy(handle)); }
    if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
    if (stream) { checkCudaErrors(cudaStreamDestroy(stream)); }

    if (h_A) { free(h_A); }
    if (h_x) { free(h_x); }
    if (h_b) { free(h_b); }
    if (h_r) { free(h_r); }

    if (d_A) { checkCudaErrors(cudaFree(d_A)); }
    if (d_x) { checkCudaErrors(cudaFree(d_x)); }
    if (d_b) { checkCudaErrors(cudaFree(d_b)); }
    if (d_r) { checkCudaErrors(cudaFree(d_r)); }

    cudaDeviceReset();

    return 0;
}

That's it. I know this is a lot of stuff; any help would be appreciated. I really don't get what I'm doing wrong with those matrices.

Let me know if I should add some other relevant info.

NB in the original code an external sparse matrix with the .mtx extension was requested and in the linearSolverLU function it was then turned to a dense matrix. I removed that stuff and now I'm directly trying to solve the linear system with a dense matrix instead.


Solution

  • The matrix you are submitting to cuSolver to factorize is invalid. The library is correctly reporting an error in the matrix entries. The culprit is this code:

    for( row =0; row < rowsA ; row++ )
    {
        for( col = 0; col < colsA ; col++ )
        {
            h_A[row*lda + col] = mat[row];
        }
    }
    

    That will produce h_A containing { 1, 1, 1, 0, 0, 0, 0, 0, 0 } which is singular, and (I assume) not what your intention was.