Tags: c++, cuda, sparse-matrix, cusparse

Function cusparseScsr2csc in the cuSPARSE library returns a strange result


I want to test cusparseScsr2csc, a function that converts a CSR-format matrix to a CSC-format matrix (in other words, transposes a CSR-format matrix), so I wrote the code below to test it.

the wrapper:

// Uploads a host-side CSR matrix to the device, runs cusparseScsr2csc on it,
// and returns the CSC output copied back into host vectors.
//
//   m, n      - forwarded to cusparseScsr2csc as the matrix dimensions; they
//               also size the row-pointer (m + 1) and column-pointer (n + 1)
//               buffers
//   nnz       - number of elements read from values and col_inds
//   values    - host array of nnz floats
//   row_ptrs  - host array of m + 1 ints
//   col_inds  - host array of nnz ints
//   st        - out-parameter: receives the status returned by cusparseScsr2csc
//
// The returned CSR<float> is constructed from (csc values, csc column
// pointers, csc row indices), so its row_ptrs member carries the column
// pointers and its col_inds member carries the row indices.
//
// NOTE(review): the return codes of cusparseCreate, cudaMalloc and cudaMemcpy
// are not checked, and nothing in this function calls cudaFree on the six
// device buffers or cusparseDestroy on the handle.
CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
    cusparseHandle_t handle;
    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseCreate(&handle);
    
    // malloc space on video card and copy data
    // csr_* buffers hold the input, csc_* buffers receive the output.
    float *csr_values;
    int *csr_row_ptrs;
    int *csr_col_inds;
    float *csc_values;
    int *csc_col_ptrs;
    int *csc_row_inds;
    cudaMalloc(&csr_values, sizeof(float) * nnz);
    cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
    cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
    cudaMalloc(&csc_values, sizeof(float) * nnz);
    cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
    cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
    cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
    cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
    // NOTE(review): the destination here is csr_values, the same buffer the
    // values copy two lines above just filled. No cudaMemcpy in this function
    // writes csr_col_inds, yet csr_col_inds is passed to cusparseScsr2csc below.
    cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    
    // use the API from cuSPARSE
    // The call's status is handed back to the caller through st.
    st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
        csr_col_inds, csc_values, csc_row_inds,
        csc_col_ptrs, copyValues, idxBase);
    
    // copy the data from device (video card) to host (CPU)
    // These copies run whether or not st reports success.
    vector<float> res_values;
    vector<int> res_row_ptrs, res_col_inds;
    res_row_ptrs.resize(n + 1);
    res_col_inds.resize(nnz);
    res_values.resize(nnz);
    cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
    
    // return the answer
    return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}

this is the CSR class:

// Host-side holder for the three arrays of a compressed sparse matrix.
// The member names follow the CSR layout; Csr2csc in this file also uses the
// struct to carry CSC output (column pointers in row_ptrs, row indices in
// col_inds).
template<class T>
struct CSR {
    std::vector<T> values;
    std::vector<int> row_ptrs;
    std::vector<int> col_inds;

    // Copies the three arrays into the struct. The parameters are const
    // references so that temporaries and const vectors are accepted as well
    // as the non-const lvalues existing callers pass.
    CSR(const std::vector<T> &a, const std::vector<int> &b, const std::vector<int> &c)
        : values(a), row_ptrs(b), col_inds(c) {}

    // Prints the three arrays to std::cout, one labelled line per array, with
    // every element followed by a single space. Does not modify the object,
    // so it is callable on a const CSR.
    void out() const {
        std::cout << "values = ";
        for (const auto &t : values) std::cout << t << ' ';
        std::cout << "\nrow_ptrs = ";
        for (const auto &t : row_ptrs) std::cout << t << ' ';
        std::cout << "\ncol_inds = ";
        for (const auto &t : col_inds) std::cout << t << ' ';
        std::cout << std::endl;
    }
};

and this is the code in the main:

// Test input: a 4 x 6 matrix with 8 stored entries, given as CSR arrays.
int m = 4, n = 6, nnz = 8;
// The nnz stored entries.
float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
// m + 1 offsets; row i's entries are values[row_ptrs[i]] .. values[row_ptrs[i + 1] - 1].
int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
// Column index of each stored entry.
int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };

// st receives the status of the cusparseScsr2csc call made inside Csr2csc.
cusparseStatus_t st;
auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);
// Prints the returned arrays; st is not inspected in this snippet.
res.out();

The CSR arrays in main are derived from the matrix below, which I want to transpose (A <=> values, IA <=> row_ptrs, JA <=> col_inds): the matrix

the result I got (definitely wrong): result

My video card is Geforce MX150, and I use Visual Studio 15 2017 with CUDA 9.2

Full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cuda_runtime.h>
#include <cusparse.h>

#include <iostream>
#include <vector>
#include <complex>

using namespace std;

// Host-side holder for the three arrays of a compressed sparse matrix.
// The member names follow the CSR layout; Csr2csc in this file also uses the
// struct to carry CSC output (column pointers in row_ptrs, row indices in
// col_inds).
template<class T>
struct CSR {
    std::vector<T> values;
    std::vector<int> row_ptrs;
    std::vector<int> col_inds;

    // Copies the three arrays into the struct. The parameters are const
    // references so that temporaries and const vectors are accepted as well
    // as the non-const lvalues existing callers pass.
    CSR(const std::vector<T> &a, const std::vector<int> &b, const std::vector<int> &c)
        : values(a), row_ptrs(b), col_inds(c) {}

    // Prints the three arrays to std::cout, one labelled line per array, with
    // every element followed by a single space. Does not modify the object,
    // so it is callable on a const CSR.
    void out() const {
        std::cout << "values = ";
        for (const auto &t : values) std::cout << t << ' ';
        std::cout << "\nrow_ptrs = ";
        for (const auto &t : row_ptrs) std::cout << t << ' ';
        std::cout << "\ncol_inds = ";
        for (const auto &t : col_inds) std::cout << t << ' ';
        std::cout << std::endl;
    }
};

// Uploads a host-side CSR matrix to the device, runs cusparseScsr2csc on it,
// and returns the CSC output copied back into host vectors.
//
//   m, n      - forwarded to cusparseScsr2csc as the matrix dimensions; they
//               also size the row-pointer (m + 1) and column-pointer (n + 1)
//               buffers
//   nnz       - number of elements read from values and col_inds
//   values    - host array of nnz floats
//   row_ptrs  - host array of m + 1 ints
//   col_inds  - host array of nnz ints
//   st        - out-parameter: receives the status returned by cusparseScsr2csc
//
// The returned CSR<float> is constructed from (csc values, csc column
// pointers, csc row indices), so its row_ptrs member carries the column
// pointers and its col_inds member carries the row indices.
//
// NOTE(review): the return codes of cusparseCreate, cudaMalloc and cudaMemcpy
// are not checked, and nothing in this function calls cudaFree on the six
// device buffers or cusparseDestroy on the handle.
CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
    cusparseHandle_t handle;
    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseCreate(&handle);
    // Device buffers: csr_* hold the input, csc_* receive the output.
    float *csr_values;
    int *csr_row_ptrs;
    int *csr_col_inds;
    float *csc_values;
    int *csc_col_ptrs;
    int *csc_row_inds;
    cudaMalloc(&csr_values, sizeof(float) * nnz);
    cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
    cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
    cudaMalloc(&csc_values, sizeof(float) * nnz);
    cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
    cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
    // Upload the input arrays.
    cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
    cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
    // NOTE(review): the destination here is csr_values, the same buffer the
    // values copy two lines above just filled. No cudaMemcpy in this function
    // writes csr_col_inds, yet csr_col_inds is passed to cusparseScsr2csc below.
    cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    // Run the conversion; the call's status is handed back through st.
    st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
        csr_col_inds, csc_values, csc_row_inds,
        csc_col_ptrs, copyValues, idxBase);
    // Copy the output back to host vectors; this runs whether or not st
    // reports success.
    vector<float> res_values;
    vector<int> res_row_ptrs, res_col_inds;
    res_row_ptrs.resize(n + 1);
    res_col_inds.resize(nnz);
    res_values.resize(nnz);
    cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
    return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}

// Test driver: converts a 4 x 6 matrix with 8 stored entries, given as CSR
// arrays, through Csr2csc and prints the returned arrays.
// Returns 0 when cusparseScsr2csc reported success. Otherwise it writes the
// numeric status to stderr and returns 1, so a failure is no longer silent.
int main()
{
    int m = 4, n = 6, nnz = 8;
    float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
    int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
    int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };

    // st receives the status of the cusparseScsr2csc call made inside Csr2csc.
    cusparseStatus_t st;
    auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);

    if (st != CUSPARSE_STATUS_SUCCESS) {
        std::cerr << "cusparseScsr2csc failed, status = " << static_cast<int>(st) << std::endl;
        return 1;
    }

    std::cout << "success" << std::endl;
    res.out();
    return 0;
}

This is the documentation page; the function cusparseScsr2csc is described in chapter 9. There I found the text below, which says that the function executes asynchronously. Maybe this is the problem, but I still don't know how to deal with it.

piece

Edit:

I tried the solution mentioned by paleonix (add cudaDeviceSynchronize() right after the cusparseScsr2csc(...)), but still got the exact same wrong answer.


Solution

  • The main problem is here:

    cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    

    That should be:

    cudaMemcpy(csr_col_inds, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    

    A few other notes: