Eigen Vectors mismatch by cuBLAS and Eigen lib

I want to do the EVD wth this 4x4 covariance matrix:

cuDoubleComplex m_cov_[16] = {
        make_cuDoubleComplex(2.0301223848037391, 3.4235008150792548e-17),
        make_cuDoubleComplex(1.0365842528472908, -2.1028119220635375),
        make_cuDoubleComplex(2.7516978261134084, 0.95059712173944222),
        make_cuDoubleComplex(-1.350157109070875, -1.1815219722269694),
        make_cuDoubleComplex(1.0365842528472908, 2.1028119220635375),
        make_cuDoubleComplex(2.7073859851827988, 8.7392491301242207e-17),
        make_cuDoubleComplex(0.42038828834095354, 3.3356003818061963),
        make_cuDoubleComplex(0.53443422887585779, -2.0017874620940646),
        make_cuDoubleComplex(2.7516978261134084, -0.95059712173944222),
        make_cuDoubleComplex(0.42038828834095354, -3.3356003818061963),
        make_cuDoubleComplex(4.1748595442022722, 2.0100622219496206e-17),
        make_cuDoubleComplex(-2.3832926547827333, -0.9692696339061293),
        make_cuDoubleComplex(-1.350157109070875, 1.1815219722269694),
        make_cuDoubleComplex(0.53443422887585779, 2.0017874620940646),
        make_cuDoubleComplex(-2.3832926547827333, 0.9692696339061293),
        make_cuDoubleComplex(1.5855784922744534, -1.5877902777669332e-17)
    };

// EVD
Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(m_cov_);
eigenValues_ = eigensolver.eigenvalues();
eigenVectors_ = eigensolver.eigenvectors();

By C++ Eigen lib, the eigen values are below:

{{x = -2.2573766836348074e-15, y = 0},
 {x = -9.709294041296323e-16, y = 0}, 
 {x = 8.9445808618868127e-17, y = 0},
 {x = 10.280850897111305, y = 0}}

And the 4x4 eigen vectors are below:

{{x = -0.3470929425161523, y = 0}, {x = -0.77713832029830621, y = -0}, {x = -0.15994536685820598, y = 0}, {x = 0.5, y = 0}}, 
{{x = -0.4597724303808105, y = 0.48181665117044714}, {x = 0.12469176895072034, y = -0.019363390950754053}, {x = -0.28597710463196591, y = 0.45689839613393701}, {x = -0.21684345356939669, y = 0.45053181535169839}},
{{x = -0.42256553981019185, y = -0.42733105533794741}, {x = 0.41437862159459787, y = 0.29068296724286291}, {x = 0.33214873805084277, y = 0.14932354148008731}, {x = 0.45697128218787925, y = 0.2029217761985285}}, 
{{x = -0.21707002501160269, y = 0.16642011333440535}, {x = -0.27240794554375036, y = 0.22298146715732753}, {x = 0.60350719516570406, y = -0.43247796708208042}, {x = -0.38102789443353835, y = 0.32375568514474662}}}

But with the same input, cuBLAS return the different results.

I use the cuBLAS EVD function below:

CHECK_CUSOLVER(cusolverDnZheevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, m_eigen_vec.data(), N, m_eigen_value.data(), d_work, lwork, devInfo));

And the eigen values and eigen vectors are below:

// Eigen Values
{-5.2180864461495171e-16,
-3.0110342183593702e-16,
2.8548936444323134e-16, 
10.497946406463264}

// Eigen Vectors
{{x = -0.38208898741712083, y = 0.41581487741664563}, {x = 0.32043709404849968, 
    y = -0.4517568232420171}, {x = 0.21717632600175682, y = -0.36577789346231687}, {x = -0.33093161501032731, 
    y = 0.28959813033042575}, {x = -0.17547383350755327, y = -0.69642001620440552}, {x = -0.10241833368805221, 
    y = -0.43952076894648784}, {x = 0.047402059701005216, y = 0.14281594546174881}, {x = 0.13099303872894871, 
    y = 0.49065012752041015}, {x = -0.32475905997549959, y = 0.077154117638887132}, {x = -0.32253254381877083, 
    y = 0.2157036870035538}, {x = -0.50261510442239055, y = 0.29617238148252628}, {x = -0.58415934115419899, 
    y = 0.23757380765099248}, {x = -0.23214840790484073, y = 0}, {x = 0.58224779741288002, y = 0}, {x = -0.67532037122395228, 
    y = 0}, {x = 0.38863480971863701, y = 0}}

You can notice that the sorted eigen values calc from Eigen Lib and from cuBLAS are the same (three ~0 values and one larger value) but the eigen vectors are totally different.

I've wrote a lot of unit tests for each step, and I'm sure that the eigen values and vectors from Eigen lib are correct, can someone tell me how to get the same answer with cuBLAS EVD?

Solution

Lifting and modifying the framework that I modified here, the results from cusolver seem to check the equation A * V = V * D, where A is your input matrix in the question, V is the matrix of eigenvectors returned by cusolver, and D is the diagonal matrix of eigenvalues returned by cusolver:

# cat t247.cu
#include <iostream>
#include <vector>
#include <cuComplex.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
const int N = 4;
cuDoubleComplex mc[N*N] = {
        make_cuDoubleComplex(2.0301223848037391, 3.4235008150792548e-17),
        make_cuDoubleComplex(1.0365842528472908, -2.1028119220635375),
        make_cuDoubleComplex(2.7516978261134084, 0.95059712173944222),
        make_cuDoubleComplex(-1.350157109070875, -1.1815219722269694),
        make_cuDoubleComplex(1.0365842528472908, 2.1028119220635375),
        make_cuDoubleComplex(2.7073859851827988, 8.7392491301242207e-17),
        make_cuDoubleComplex(0.42038828834095354, 3.3356003818061963),
        make_cuDoubleComplex(0.53443422887585779, -2.0017874620940646),
        make_cuDoubleComplex(2.7516978261134084, -0.95059712173944222),
        make_cuDoubleComplex(0.42038828834095354, -3.3356003818061963),
        make_cuDoubleComplex(4.1748595442022722, 2.0100622219496206e-17),
        make_cuDoubleComplex(-2.3832926547827333, -0.9692696339061293),
        make_cuDoubleComplex(-1.350157109070875, 1.1815219722269694),
        make_cuDoubleComplex(0.53443422887585779, 2.0017874620940646),
        make_cuDoubleComplex(-2.3832926547827333, 0.9692696339061293),
        make_cuDoubleComplex(1.5855784922744534, -1.5877902777669332e-17)
    };

// Helper function to print a complex matrix
void printMatrix(const cuDoubleComplex* A, int rows, int cols) {
    for (int i = 0; i < cols; ++i) {
        for (int j = 0; j < rows; ++j) {
            std::cout << "(" << cuCreal(A[j * cols + i]) << ", " << cuCimag(A[j * cols + i]) << ") ";
        }
        std::cout << std::endl;
    }
}

// Helper function to check if two complex numbers are approximately equal
bool isApproxEqual(cuDoubleComplex a, cuDoubleComplex b, double tol = 1e-6) {
    return (fabs(cuCreal(a) - cuCreal(b)) < tol) && (fabs(cuCimag(a) - cuCimag(b)) < tol);
}

int main() {
    // Define a 4x4 complex covariance matrix
#ifdef USE_TRANSPOSE
    cuDoubleComplex *A = new cuDoubleComplex[N*N];
    for (int i = 0; i < N; i++)
      for (int j = 0; j < N; j++)
        A[i*N+j] = mc[j*N+i];
#else
    cuDoubleComplex *A = mc;
#endif
    std::cout << "Original matrix A:" << std::endl;
    printMatrix(A, N, N);

    // cuBLAS and cuSOLVER setup
    cusolverDnHandle_t cusolverH = nullptr;
    cublasHandle_t cublasH = nullptr;
    cudaStream_t stream = nullptr;
    cusolverStatus_t csstat = cusolverDnCreate(&cusolverH);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
    cublasStatus_t cbstat = cublasCreate(&cublasH);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;
  //  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaStreamCreate(&stream);
    csstat = cusolverDnSetStream(cusolverH, stream);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
    cbstat = cublasSetStream(cublasH, stream);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;

    // Device memory for matrix A, eigenvalues, and workspace
    cuDoubleComplex* d_A = nullptr;
    double* d_W = nullptr;
    int* devInfo = nullptr;
    int lwork = 0;
    cuDoubleComplex* d_work = nullptr;

    cudaMalloc((void**)&d_A, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_W, sizeof(double) * N);
    cudaMalloc((void**)&devInfo, sizeof(int));

    cudaMemcpy(d_A, A, sizeof(cuDoubleComplex) * N * N, cudaMemcpyHostToDevice);

    // Query working space
    csstat = cusolverDnZheevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, d_A, N, d_W, &lwork);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
    cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork);
    // Compute EVD
    csstat = cusolverDnZheevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, d_A, N, d_W, d_work, lwork, devInfo);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;

    // Copy results back to host
    cuDoubleComplex V[N * N];
    double W[N];
    cudaMemcpy(V, d_A, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(W, d_W, sizeof(double) * N, cudaMemcpyDeviceToHost);
    int hinfo;
    cudaMemcpy(&hinfo, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
    if (hinfo) std::cout << "devInfo: " << hinfo << std::endl;
    std::cout << "Eigenvalues (diagonal of D):" << std::endl;
    for (int i = 0; i < N; ++i) {
        std::cout << W[i] << " ";
    }
    std::cout << std::endl;

    std::cout << "Eigenvectors (columns of V):" << std::endl;
    printMatrix(V, N, N);

    // Verify A * V = V * D using cuBLAS
    cuDoubleComplex* d_V = nullptr;
    cuDoubleComplex* d_D = nullptr;
    cuDoubleComplex* d_DV = nullptr;
    cuDoubleComplex* d_AV = nullptr;

    cudaMalloc((void**)&d_V, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_D, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_DV, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_AV, sizeof(cuDoubleComplex) * N * N);

    cudaMemset(d_D, 0, N*N*sizeof(cuDoubleComplex));
    cuDoubleComplex *ev = new cuDoubleComplex[N];
    for (int i = 0; i < N; i++) ev[i] = make_cuDoubleComplex(W[i], 0.0);
    for (int i = 0; i < N; i++) cudaMemcpy(d_D+(i*(N+1)), ev+i, sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

    cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
    cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0);

    // Calculate V * D  d_A now contains the eigenvectors, it is effectively V
    cbstat = cublasZgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_D, N, &beta, d_DV, N);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;

    cuDoubleComplex* d_nA = nullptr;
    cudaMalloc(&d_nA, N*N*sizeof(cuDoubleComplex));
    cudaMemcpy(d_nA, A, N*N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    // Calculate A * V
    cbstat = cublasZgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_nA, N, d_A, N, &beta, d_AV, N);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;

    // Copy results back to host
    cuDoubleComplex AV[N * N];
    cuDoubleComplex DV[N * N];
    cudaMemcpy(AV, d_AV, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(DV, d_DV, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);

    std::cout << "A * V:" << std::endl;
    printMatrix(AV, N, N);

    std::cout << "V * D:" << std::endl;
    printMatrix(DV, N, N);

    // Check if A * V is approximately equal to V * D
    bool equal = true;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            if (!isApproxEqual(AV[i * N + j], DV[i * N + j])) {
                equal = false;
                break;
            }
        }
    }

    if (equal) {
        std::cout << "Verification successful: A * V = V * D" << std::endl;
    } else {
        std::cout << "Verification failed: A * V != V * D" << std::endl;
    }

    // Clean up
    cudaFree(d_A);
    cudaFree(d_W);
    cudaFree(devInfo);
    cudaFree(d_work);
    cudaFree(d_V);
    cudaFree(d_DV);
    cudaFree(d_AV);
    cusolverDnDestroy(cusolverH);
    cublasDestroy(cublasH);
    cudaStreamDestroy(stream);
    return 0;
}
# nvcc -o t247 t247.cu -lcublas -lcusolver
# compute-sanitizer ./t247
========= COMPUTE-SANITIZER
Original matrix A:
(2.03012, 3.4235e-17) (1.03658, 2.10281) (2.7517, -0.950597) (-1.35016, 1.18152)
(1.03658, -2.10281) (2.70739, 8.73925e-17) (0.420388, -3.3356) (0.534434, 2.00179)
(2.7517, 0.950597) (0.420388, 3.3356) (4.17486, 2.01006e-17) (-2.38329, 0.96927)
(-1.35016, -1.18152) (0.534434, -2.00179) (-2.38329, -0.96927) (1.58558, -1.58779e-17)
Eigenvalues (diagonal of D):
-5.21809e-16 -3.01103e-16 2.85489e-16 10.4979
Eigenvectors (columns of V):
(-0.382089, 0.415815) (0.320437, -0.451757) (0.217176, -0.365778) (-0.330932, 0.289598)
(-0.175474, -0.69642) (-0.102418, -0.439521) (0.0474021, 0.142816) (0.130993, 0.49065)
(-0.324759, 0.0771541) (-0.322533, 0.215704) (-0.502615, 0.296172) (-0.584159, 0.237574)
(-0.232148, 0) (0.582248, 0) (-0.67532, 0) (0.388635, 0)
A * V:
(-7.00371e-17, 2.32465e-16) (-4.94086e-17, 4.01301e-16) (-5.28972e-17, 4.03915e-17) (-3.4741, 3.04019)
(2.63345e-16, 3.01536e-16) (3.27777e-16, 3.2118e-16) (-8.58426e-17, -4.60878e-16) (1.37516, 5.15082)
(-1.31867e-16, 5.0219e-16) (-2.23634e-16, 2.87755e-16) (4.18528e-16, -2.85843e-16) (-6.13247, 2.49404)
(1.07082e-16, -1.23345e-16) (1.49087e-16, -1.99645e-16) (7.87887e-17, -1.82635e-17) (4.07987, -2.48075e-16)
V * D:
(1.99377e-16, -2.16976e-16) (-9.64847e-17, 1.36026e-16) (6.20015e-17, -1.04426e-16) (-3.4741, 3.04019)
(9.15638e-17, 3.63398e-16) (3.08385e-17, 1.32341e-16) (1.35328e-17, 4.07724e-17) (1.37516, 5.15082)
(1.69462e-16, -4.02597e-17) (9.71157e-17, -6.49491e-17) (-1.43491e-16, 8.45541e-17) (-6.13247, 2.49404)
(1.21137e-16, 0) (-1.75317e-16, 0) (-1.92797e-16, 0) (4.07987, 0)
Verification successful: A * V = V * D
========= ERROR SUMMARY: 0 errors
#