Eigen decomposition of Hermitian Matrix using CuSolver does not match the result with matlab

I am following the example of eigen decomposition from here, https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSOLVER/syevd/cusolver_syevd_example.cu

I need to do it for Hermatian complex matrix. The problem is the eigen vector is not matching at all with the result with Matlab result.

Does anyone have any idea about it why this mismatch is happening?

I have also tried cusolverdn svd method to get eigen values and vector that is giving another result.

My code is here for convenience,


#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cuda_runtime.h>
#include <cusolverDn.h>

#include "cusolver_utils.h"

int N = 16;
void BuildMatrix(cuComplex* input);

void main()
{
    cusolverDnHandle_t cusolverH = NULL;
    cudaStream_t stream = NULL;
    printf("*******************\n");

    cuComplex* h_idata = (cuComplex*)malloc(sizeof(cuComplex) * N);
    cuComplex* h_eigenVector = (cuComplex*)malloc(sizeof(cuComplex) * N); // eigen vector
    float* h_eigenValue = (float*)malloc(sizeof(float) * 4);    // eigen Value
    BuildMatrix(h_idata);
    int count = 0;
    for (int i = 0; i < N / 4; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            printf("%f + %f\t", h_idata[count].x, h_idata[count].y);
            count++;
        }
        printf("\n");
    }
    printf("\n*****************\n");

    /* step 1: create cusolver handle, bind a stream */
    CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));

    // step 2: reserve memory in cuda and copy input data from host to device
    cuComplex* d_idata;
    float* d_eigenValue = nullptr;
    int* d_info = nullptr;

    CUDA_CHECK(cudaMalloc((void**)&d_idata, N * sizeof(cuComplex)));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_eigenValue), N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));

    CUDA_CHECK(cudaMemcpyAsync(d_idata, h_idata, N * sizeof(cuComplex), cudaMemcpyHostToDevice, stream));

    // step 3: query working space of syevd
    cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvalues and eigenvectors.
    cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;

    int lwork = 0;            /* size of workspace */
    cuComplex* d_work = nullptr; /* device workspace*/
    const int m = 4;
    const int lda = m;

    cusolverDnCheevd_bufferSize(cusolverH, jobz, uplo, m, d_idata, lda, d_eigenValue, &lwork);

    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));

    // step 4: compute spectrum
    cusolverDnCheevd(cusolverH, jobz, uplo, m, d_idata, lda, d_eigenValue, d_work, lwork, d_info);

    CUDA_CHECK(
        cudaMemcpyAsync(h_eigenVector, d_idata, N * sizeof(cuComplex), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(
        cudaMemcpyAsync(h_eigenValue, d_eigenValue, 4 * sizeof(double), cudaMemcpyDeviceToHost, stream));

    int info = 0;
    CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    std::printf("after syevd: info = %d\n", info);
    if (0 > info)
    {
        std::printf("%d-th parameter is wrong \n", -info);
        exit(1);
    }

    count = 0;
    for (int i = 0; i < N / 4; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            printf("%f + %f\t", h_eigenVector[count].x, h_eigenVector[count].y);
            count++;
        }
        printf("\n");
    }
    printf("\n");
    for (int i = 0; i < N / 4; i++)
    {
        std::cout << h_eigenValue[i] << std::endl;
    }
    printf("\n*****************\n");

    /* free resources */
    CUDA_CHECK(cudaFree(d_idata));
    CUDA_CHECK(cudaFree(d_eigenValue));
    CUDA_CHECK(cudaFree(d_info));
    CUDA_CHECK(cudaFree(d_work));

    CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));

    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaDeviceReset());
}

//0.5560 + 0.0000i - 0.4864 + 0.0548i   0.8592 + 0.2101i - 1.5374 - 0.2069i
//- 0.4864 - 0.0548i   0.4317 + 0.0000i - 0.7318 - 0.2698i   1.3255 + 0.3344i
//0.8592 - 0.2101i - 0.7318 + 0.2698i   1.4099 + 0.0000i - 2.4578 + 0.2609i
//- 1.5374 + 0.2069i   1.3255 - 0.3344i - 2.4578 - 0.2609i   4.3333 + 0.0000i
void BuildMatrix(cuComplex* input)
{
    std::vector<float> realVector = { 0.5560, -0.4864, 0.8592, -1.5374, -0.4864, 0.4317, -0.7318, 1.3255,
                                    0.8592, -0.7318, 1.4099, -2.4578, -1.5374, 1.3255, -2.4578, 4.3333 };
    std::vector<float> imagVector = { 0, -0.0548, -0.2101, 0.2069, 0.0548, 0.0000, 0.2698, -0.3344,
                                     0.2101, -0.2698, 0, -0.2609, -0.2069, 0.3344, 0.2609, 0 };

    for (int i = 0; i < N; i++)
    {
        input[i].x = realVector.at(i) * std::pow(10, 11);
        input[i].y = imagVector.at(i) * std::pow(10, 11);
    }
}

I raised this issue in their git ( https://github.com/NVIDIA/CUDALibrarySamples/issues/58), but unfortunately no one is answering.

If anyone can help me to solve this that will be very helpful.

Solution

Please follow the post for the clear answer, https://forums.developer.nvidia.com/t/eigen-decomposition-of-hermitian-matrix-using-cusolver-does-not-match-the-result-with-matlab/204157

The theory tells, A*V-lamda*V=0 should satisfy, however it might not be perfect zero. My thinking was it will very very close to zero or e-14 somethng like this. If the equation gives a value close to zero then it is acceptable.

There are different algorithms for solving eigen decomposition, like Jacobi algorithm, Cholesky factorization... The program I provided in my post uses the function cusolverDnCheevd which is based on LAPACK. LAPACK doc tells that it uses divide and conquer algorithm to solve Hermitian matrix. Here is the link, http://www.netlib.org/lapack/explore-html/d9/de3/group__complex_h_eeigen_ga6084b0819f9642f0db26257e8a3ebd42.html#ga6084b0819f9642f0db26257e8a3ebd42