c++cudacublas

Eigen Vectors mismatch by cuBLAS and Eigen lib


I want to do the EVD wth this 4x4 covariance matrix:

cuDoubleComplex m_cov_[16] = {
        make_cuDoubleComplex(2.0301223848037391, 3.4235008150792548e-17),
        make_cuDoubleComplex(1.0365842528472908, -2.1028119220635375),
        make_cuDoubleComplex(2.7516978261134084, 0.95059712173944222),
        make_cuDoubleComplex(-1.350157109070875, -1.1815219722269694),
        make_cuDoubleComplex(1.0365842528472908, 2.1028119220635375),
        make_cuDoubleComplex(2.7073859851827988, 8.7392491301242207e-17),
        make_cuDoubleComplex(0.42038828834095354, 3.3356003818061963),
        make_cuDoubleComplex(0.53443422887585779, -2.0017874620940646),
        make_cuDoubleComplex(2.7516978261134084, -0.95059712173944222),
        make_cuDoubleComplex(0.42038828834095354, -3.3356003818061963),
        make_cuDoubleComplex(4.1748595442022722, 2.0100622219496206e-17),
        make_cuDoubleComplex(-2.3832926547827333, -0.9692696339061293),
        make_cuDoubleComplex(-1.350157109070875, 1.1815219722269694),
        make_cuDoubleComplex(0.53443422887585779, 2.0017874620940646),
        make_cuDoubleComplex(-2.3832926547827333, 0.9692696339061293),
        make_cuDoubleComplex(1.5855784922744534, -1.5877902777669332e-17)
    };

// EVD
Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(m_cov_);
eigenValues_ = eigensolver.eigenvalues();
eigenVectors_ = eigensolver.eigenvectors();

By C++ Eigen lib, the eigen values are below:

{{x = -2.2573766836348074e-15, y = 0},
 {x = -9.709294041296323e-16, y = 0}, 
 {x = 8.9445808618868127e-17, y = 0},
 {x = 10.280850897111305, y = 0}}

And the 4x4 eigen vectors are below:

{{x = -0.3470929425161523, y = 0}, {x = -0.77713832029830621, y = -0}, {x = -0.15994536685820598, y = 0}, {x = 0.5, y = 0}}, 
{{x = -0.4597724303808105, y = 0.48181665117044714}, {x = 0.12469176895072034, y = -0.019363390950754053}, {x = -0.28597710463196591, y = 0.45689839613393701}, {x = -0.21684345356939669, y = 0.45053181535169839}},
{{x = -0.42256553981019185, y = -0.42733105533794741}, {x = 0.41437862159459787, y = 0.29068296724286291}, {x = 0.33214873805084277, y = 0.14932354148008731}, {x = 0.45697128218787925, y = 0.2029217761985285}}, 
{{x = -0.21707002501160269, y = 0.16642011333440535}, {x = -0.27240794554375036, y = 0.22298146715732753}, {x = 0.60350719516570406, y = -0.43247796708208042}, {x = -0.38102789443353835, y = 0.32375568514474662}}}

But with the same input, cuBLAS return the different results.

I use the cuBLAS EVD function below:

CHECK_CUSOLVER(cusolverDnZheevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, m_eigen_vec.data(), N, m_eigen_value.data(), d_work, lwork, devInfo));

And the eigen values and eigen vectors are below:

// Eigen Values
{-5.2180864461495171e-16,
-3.0110342183593702e-16,
2.8548936444323134e-16, 
10.497946406463264}

// Eigen Vectors
{{x = -0.38208898741712083, y = 0.41581487741664563}, {x = 0.32043709404849968, 
    y = -0.4517568232420171}, {x = 0.21717632600175682, y = -0.36577789346231687}, {x = -0.33093161501032731, 
    y = 0.28959813033042575}, {x = -0.17547383350755327, y = -0.69642001620440552}, {x = -0.10241833368805221, 
    y = -0.43952076894648784}, {x = 0.047402059701005216, y = 0.14281594546174881}, {x = 0.13099303872894871, 
    y = 0.49065012752041015}, {x = -0.32475905997549959, y = 0.077154117638887132}, {x = -0.32253254381877083, 
    y = 0.2157036870035538}, {x = -0.50261510442239055, y = 0.29617238148252628}, {x = -0.58415934115419899, 
    y = 0.23757380765099248}, {x = -0.23214840790484073, y = 0}, {x = 0.58224779741288002, y = 0}, {x = -0.67532037122395228, 
    y = 0}, {x = 0.38863480971863701, y = 0}}

You can notice that the sorted eigen values calc from Eigen Lib and from cuBLAS are the same (three ~0 values and one larger value) but the eigen vectors are totally different.

I've wrote a lot of unit tests for each step, and I'm sure that the eigen values and vectors from Eigen lib are correct, can someone tell me how to get the same answer with cuBLAS EVD?


Solution

  • Lifting and modifying the framework that I modified here, the results from cusolver seem to check the equation A * V = V * D, where A is your input matrix in the question, V is the matrix of eigenvectors returned by cusolver, and D is the diagonal matrix of eigenvalues returned by cusolver:

    # cat t247.cu
    #include <iostream>
    #include <vector>
    #include <cuComplex.h>
    #include <cusolverDn.h>
    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    const int N = 4;
    cuDoubleComplex mc[N*N] = {
            make_cuDoubleComplex(2.0301223848037391, 3.4235008150792548e-17),
            make_cuDoubleComplex(1.0365842528472908, -2.1028119220635375),
            make_cuDoubleComplex(2.7516978261134084, 0.95059712173944222),
            make_cuDoubleComplex(-1.350157109070875, -1.1815219722269694),
            make_cuDoubleComplex(1.0365842528472908, 2.1028119220635375),
            make_cuDoubleComplex(2.7073859851827988, 8.7392491301242207e-17),
            make_cuDoubleComplex(0.42038828834095354, 3.3356003818061963),
            make_cuDoubleComplex(0.53443422887585779, -2.0017874620940646),
            make_cuDoubleComplex(2.7516978261134084, -0.95059712173944222),
            make_cuDoubleComplex(0.42038828834095354, -3.3356003818061963),
            make_cuDoubleComplex(4.1748595442022722, 2.0100622219496206e-17),
            make_cuDoubleComplex(-2.3832926547827333, -0.9692696339061293),
            make_cuDoubleComplex(-1.350157109070875, 1.1815219722269694),
            make_cuDoubleComplex(0.53443422887585779, 2.0017874620940646),
            make_cuDoubleComplex(-2.3832926547827333, 0.9692696339061293),
            make_cuDoubleComplex(1.5855784922744534, -1.5877902777669332e-17)
        };
    
    // Helper function to print a complex matrix
    void printMatrix(const cuDoubleComplex* A, int rows, int cols) {
        for (int i = 0; i < cols; ++i) {
            for (int j = 0; j < rows; ++j) {
                std::cout << "(" << cuCreal(A[j * cols + i]) << ", " << cuCimag(A[j * cols + i]) << ") ";
            }
            std::cout << std::endl;
        }
    }
    
    // Helper function to check if two complex numbers are approximately equal
    bool isApproxEqual(cuDoubleComplex a, cuDoubleComplex b, double tol = 1e-6) {
        return (fabs(cuCreal(a) - cuCreal(b)) < tol) && (fabs(cuCimag(a) - cuCimag(b)) < tol);
    }
    
    int main() {
        // Define a 4x4 complex covariance matrix
    #ifdef USE_TRANSPOSE
        cuDoubleComplex *A = new cuDoubleComplex[N*N];
        for (int i = 0; i < N; i++)
          for (int j = 0; j < N; j++)
            A[i*N+j] = mc[j*N+i];
    #else
        cuDoubleComplex *A = mc;
    #endif
        std::cout << "Original matrix A:" << std::endl;
        printMatrix(A, N, N);
    
        // cuBLAS and cuSOLVER setup
        cusolverDnHandle_t cusolverH = nullptr;
        cublasHandle_t cublasH = nullptr;
        cudaStream_t stream = nullptr;
        cusolverStatus_t csstat = cusolverDnCreate(&cusolverH);
        if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
        cublasStatus_t cbstat = cublasCreate(&cublasH);
        if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;
      //  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
        cudaStreamCreate(&stream);
        csstat = cusolverDnSetStream(cusolverH, stream);
        if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
        cbstat = cublasSetStream(cublasH, stream);
        if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;
    
        // Device memory for matrix A, eigenvalues, and workspace
        cuDoubleComplex* d_A = nullptr;
        double* d_W = nullptr;
        int* devInfo = nullptr;
        int lwork = 0;
        cuDoubleComplex* d_work = nullptr;
    
        cudaMalloc((void**)&d_A, sizeof(cuDoubleComplex) * N * N);
        cudaMalloc((void**)&d_W, sizeof(double) * N);
        cudaMalloc((void**)&devInfo, sizeof(int));
    
        cudaMemcpy(d_A, A, sizeof(cuDoubleComplex) * N * N, cudaMemcpyHostToDevice);
    
        // Query working space
        csstat = cusolverDnZheevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, d_A, N, d_W, &lwork);
        if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
        cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork);
        // Compute EVD
        csstat = cusolverDnZheevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, d_A, N, d_W, d_work, lwork, devInfo);
        if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
    
        // Copy results back to host
        cuDoubleComplex V[N * N];
        double W[N];
        cudaMemcpy(V, d_A, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
        cudaMemcpy(W, d_W, sizeof(double) * N, cudaMemcpyDeviceToHost);
        int hinfo;
        cudaMemcpy(&hinfo, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
        if (hinfo) std::cout << "devInfo: " << hinfo << std::endl;
        std::cout << "Eigenvalues (diagonal of D):" << std::endl;
        for (int i = 0; i < N; ++i) {
            std::cout << W[i] << " ";
        }
        std::cout << std::endl;
    
        std::cout << "Eigenvectors (columns of V):" << std::endl;
        printMatrix(V, N, N);
    
        // Verify A * V = V * D using cuBLAS
        cuDoubleComplex* d_V = nullptr;
        cuDoubleComplex* d_D = nullptr;
        cuDoubleComplex* d_DV = nullptr;
        cuDoubleComplex* d_AV = nullptr;
    
        cudaMalloc((void**)&d_V, sizeof(cuDoubleComplex) * N * N);
        cudaMalloc((void**)&d_D, sizeof(cuDoubleComplex) * N * N);
        cudaMalloc((void**)&d_DV, sizeof(cuDoubleComplex) * N * N);
        cudaMalloc((void**)&d_AV, sizeof(cuDoubleComplex) * N * N);
    
        cudaMemset(d_D, 0, N*N*sizeof(cuDoubleComplex));
        cuDoubleComplex *ev = new cuDoubleComplex[N];
        for (int i = 0; i < N; i++) ev[i] = make_cuDoubleComplex(W[i], 0.0);
        for (int i = 0; i < N; i++) cudaMemcpy(d_D+(i*(N+1)), ev+i, sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    
        cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
        cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0);
    
        // Calculate V * D  d_A now contains the eigenvectors, it is effectively V
        cbstat = cublasZgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_D, N, &beta, d_DV, N);
        if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;
    
        cuDoubleComplex* d_nA = nullptr;
        cudaMalloc(&d_nA, N*N*sizeof(cuDoubleComplex));
        cudaMemcpy(d_nA, A, N*N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
        // Calculate A * V
        cbstat = cublasZgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_nA, N, d_A, N, &beta, d_AV, N);
        if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;
    
        // Copy results back to host
        cuDoubleComplex AV[N * N];
        cuDoubleComplex DV[N * N];
        cudaMemcpy(AV, d_AV, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
        cudaMemcpy(DV, d_DV, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
    
        std::cout << "A * V:" << std::endl;
        printMatrix(AV, N, N);
    
        std::cout << "V * D:" << std::endl;
        printMatrix(DV, N, N);
    
        // Check if A * V is approximately equal to V * D
        bool equal = true;
        for (int i = 0; i < N; ++i) {
            for (int j = 0; j < N; ++j) {
                if (!isApproxEqual(AV[i * N + j], DV[i * N + j])) {
                    equal = false;
                    break;
                }
            }
        }
    
        if (equal) {
            std::cout << "Verification successful: A * V = V * D" << std::endl;
        } else {
            std::cout << "Verification failed: A * V != V * D" << std::endl;
        }
    
        // Clean up
        cudaFree(d_A);
        cudaFree(d_W);
        cudaFree(devInfo);
        cudaFree(d_work);
        cudaFree(d_V);
        cudaFree(d_DV);
        cudaFree(d_AV);
        cusolverDnDestroy(cusolverH);
        cublasDestroy(cublasH);
        cudaStreamDestroy(stream);
        return 0;
    }
    # nvcc -o t247 t247.cu -lcublas -lcusolver
    # compute-sanitizer ./t247
    ========= COMPUTE-SANITIZER
    Original matrix A:
    (2.03012, 3.4235e-17) (1.03658, 2.10281) (2.7517, -0.950597) (-1.35016, 1.18152)
    (1.03658, -2.10281) (2.70739, 8.73925e-17) (0.420388, -3.3356) (0.534434, 2.00179)
    (2.7517, 0.950597) (0.420388, 3.3356) (4.17486, 2.01006e-17) (-2.38329, 0.96927)
    (-1.35016, -1.18152) (0.534434, -2.00179) (-2.38329, -0.96927) (1.58558, -1.58779e-17)
    Eigenvalues (diagonal of D):
    -5.21809e-16 -3.01103e-16 2.85489e-16 10.4979
    Eigenvectors (columns of V):
    (-0.382089, 0.415815) (0.320437, -0.451757) (0.217176, -0.365778) (-0.330932, 0.289598)
    (-0.175474, -0.69642) (-0.102418, -0.439521) (0.0474021, 0.142816) (0.130993, 0.49065)
    (-0.324759, 0.0771541) (-0.322533, 0.215704) (-0.502615, 0.296172) (-0.584159, 0.237574)
    (-0.232148, 0) (0.582248, 0) (-0.67532, 0) (0.388635, 0)
    A * V:
    (-7.00371e-17, 2.32465e-16) (-4.94086e-17, 4.01301e-16) (-5.28972e-17, 4.03915e-17) (-3.4741, 3.04019)
    (2.63345e-16, 3.01536e-16) (3.27777e-16, 3.2118e-16) (-8.58426e-17, -4.60878e-16) (1.37516, 5.15082)
    (-1.31867e-16, 5.0219e-16) (-2.23634e-16, 2.87755e-16) (4.18528e-16, -2.85843e-16) (-6.13247, 2.49404)
    (1.07082e-16, -1.23345e-16) (1.49087e-16, -1.99645e-16) (7.87887e-17, -1.82635e-17) (4.07987, -2.48075e-16)
    V * D:
    (1.99377e-16, -2.16976e-16) (-9.64847e-17, 1.36026e-16) (6.20015e-17, -1.04426e-16) (-3.4741, 3.04019)
    (9.15638e-17, 3.63398e-16) (3.08385e-17, 1.32341e-16) (1.35328e-17, 4.07724e-17) (1.37516, 5.15082)
    (1.69462e-16, -4.02597e-17) (9.71157e-17, -6.49491e-17) (-1.43491e-16, 8.45541e-17) (-6.13247, 2.49404)
    (1.21137e-16, 0) (-1.75317e-16, 0) (-1.92797e-16, 0) (4.07987, 0)
    Verification successful: A * V = V * D
    ========= ERROR SUMMARY: 0 errors
    #