OK. I'm getting my hands dirty with some code I took from cuSolver samples. I'm little experienced with C++, but I somehow managed to take the stuff I need from the original code.
The problem is when I try to execute it; as recommended from the reference manual I compile with:
nvcc -c att3_cus_lu.cpp -I/usr/local/cuda-8.0/targets/x86_64-linux/include
g++ -fopenmp -o res.out att3_cus_lu.o -L/usr/local/cuda/lib64 -lcudart -lcublas -lcusolver -lcusparse
No problems till here; the output I get is always the same though:
step 1: set matrix (A) and right hand side vector (b)
step 2: prepare data on device
step 3: solve A*x = b
Error: LU factorization failed
INFO_VALUE = 2
timing: LU = 0.000208 sec
step 4: evaluate residual
|b - A*x| = NAN
|A| = 1.000000E+00
|x| = NAN
|b - A*x|/(|A|*|x|) = NAN
Whatever matrix or b vector I put the result is the same; the code can't factorize the matrix.
I've shown the INFO_VALUE whose value is always 2 on every execution; it's the value of the info
variable requested for the cuSolverDnDgetrf() function. In the cuSolver reference manual it says:
This function computes the LU factorization of a m×n matrix PA = LU where A is a m×n matrix, P is a permutation matrix, L is a lower triangular matrix with unit diagonal, and U is an upper triangular matrix. If LU factorization failed, i.e. matrix A(U) is singular, The output parameter devInfo=i indicates U(i,i) = 0.
In the code below I've put the identical matrix though, so no singular matrices are running around.
Here is the whole code; the pattern for main() cuda codes is repetitive: define host variables, cudaMemcpy them to the device, execute them on the device with cuda functions, cudaMemcpy them back to host, then go on with serial code until you repeat.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cuda.h"
#include "helper_cusolver.h"
#ifndef MAX
#define MAX(a,b) (a > b) ? a : b
#endif
void linearSolverLU(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int *ipiv = NULL; // pivoting sequence
int h_info = 0;
double start, stop;
double time_solve;
checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n,(double*)Acopy, lda, &bufferSize));
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int)*n));
// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if ( 0 != h_info ){
fprintf(stderr, "Error: LU factorization failed\n");
printf("INFO_VALUE = %d\n",h_info);
}
checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf (stdout, "timing: LU = %10.6f sec\n", time_solve);
if (info ) { checkCudaErrors(cudaFree(info )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A ) { checkCudaErrors(cudaFree(A)); }
if (ipiv ) { checkCudaErrors(cudaFree(ipiv));}
}
//int main (int argc, char *argv[]) !!!
int main(void)
{
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
cudaStream_t stream = NULL;
int rowsA = 3; // number of rows of A
int colsA = 3; // number of columns of A
int lda = MAX(colsA, rowsA); // leading dimension in dense matrix
double *h_A = NULL; // dense matrix
double *h_x = NULL; // a copy of d_x
double *h_b = NULL; // b = ones(m,1)
double *h_r = NULL; // r = b - A*x, a copy of d_r
double *d_A = NULL; // a copy of h_A
double *d_x = NULL; // x = A \ b
double *d_b = NULL; // a copy of h_b
double *d_r = NULL; // r = b - A*x
// the constants are used in residual evaluation, r = b - A*x
const double minus_one = -1.0;
const double one = 1.0;
double x_inf = 0.0;
double r_inf = 0.0;
double A_inf = 0.0;
int errors = 0;
int i, j, col, row, k;
h_A = (double*)malloc(sizeof(double)*lda*colsA);
h_x = (double*)malloc(sizeof(double)*colsA);
h_b = (double*)malloc(sizeof(double)*rowsA);
h_r = (double*)malloc(sizeof(double)*rowsA);
assert(NULL != h_A);
assert(NULL != h_x);
assert(NULL != h_b);
assert(NULL != h_r);
memset(h_A, 0., sizeof(double)*lda*colsA);
printf("step 1: set matrix (A) and right hand side vector (b)\n");
double mat[9] = {1.,0.,0.,0.,1.,0.,0.,0.,1.};
double bb[3] = {1., 1., 1.}; //RANDOM MATRICES 4 TESTING
for( row =0; row < rowsA ; row++ )
{
for( col = 0; col < colsA ; col++ )
{
h_A[row*lda + col] = mat[row];
}
}
for( k = 0; k < rowsA; k++ ){
h_b[k] = bb[k];
}
checkCudaErrors(cusolverDnCreate(&handle));
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cudaStreamCreate(&stream));
checkCudaErrors(cusolverDnSetStream(handle, stream));
checkCudaErrors(cublasSetStream(cublasHandle, stream));
checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*colsA));
checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*rowsA));
checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rowsA));
printf("step 2: prepare data on device\n");
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double)*lda*colsA, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));
printf("step 3: solve A*x = b \n");
linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
printf("step 4: evaluate residual\n");
checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*rowsA, cudaMemcpyDeviceToDevice));
// r = b - A*x
checkCudaErrors(cublasDgemm_v2(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
rowsA,
1,
colsA,
&minus_one,
d_A,
lda,
d_x,
rowsA,
&one,
d_r,
rowsA));
checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*colsA, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));
x_inf = vec_norminf(colsA, h_x);
r_inf = vec_norminf(rowsA, h_r);
A_inf = mat_norminf(rowsA, colsA, h_A, lda);
printf("|b - A*x| = %E \n", r_inf);
printf("|A| = %E \n", A_inf);
printf("|x| = %E \n", x_inf);
printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));
if (handle) { checkCudaErrors(cusolverDnDestroy(handle)); }
if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
if (stream) { checkCudaErrors(cudaStreamDestroy(stream)); }
if (h_A) { free(h_A); }
if (h_x) { free(h_x); }
if (h_b) { free(h_b); }
if (h_r) { free(h_r); }
if (d_A) { checkCudaErrors(cudaFree(d_A)); }
if (d_x) { checkCudaErrors(cudaFree(d_x)); }
if (d_b) { checkCudaErrors(cudaFree(d_b)); }
if (d_r) { checkCudaErrors(cudaFree(d_r)); }
cudaDeviceReset();
return 0;
}
That's it. I know this is a lot of stuff; any help would be appreciated. I really don't get what I'm doing wrong with those matrices.
Let me know if I should add some other relevant info.
NB in the original code an external sparse matrix with the .mtx extension was requested and in the linearSolverLU function it was then turned to a dense matrix. I removed that stuff and now I'm directly trying to solve the linear system with a dense matrix instead.
The matrix you are submitting to cuSolver to factorize is invalid. The library is correctly reporting an error in the matrix entries. The culprit is this code:
for( row =0; row < rowsA ; row++ )
{
for( col = 0; col < colsA ; col++ )
{
h_A[row*lda + col] = mat[row];
}
}
That will produce h_A
containing { 1, 1, 1, 0, 0, 0, 0, 0, 0 }
which is singular, and (I assume) not what your intention was.