I need to transpose a sparse matrix stored in CSR format using cuSPARSE, but I get an “internal error”. I wrote my code by referring to “How to transpose a sparse matrix in cuSparse?” and https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2. To be clear, I am trying to perform the transpose by converting the matrix from CSR format to CSC format.
I am running on an Nvidia GeForce GTX 1080 with CUDA 11.1.0 (cuda_11.1.0) on Windows 10.
My code follows. To reproduce the problem, download the folder from https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/sparse2dense, replace sparse2dense_example.c with my code, and then configure and build with CMake.
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h> // cusparseSparseToDense
#include <stdio.h> // printf
#include <stdlib.h> // EXIT_FAILURE
// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, prints the source line together with the error string and
// numeric code, then returns EXIT_FAILURE from the enclosing function.
// Expands to a braced block, so call sites work with or without a trailing ';'.
#define CHECK_CUDA(func)                                                       \
{                                                                              \
    const cudaError_t cuda_rc = (func);                                        \
    if (cuda_rc != cudaSuccess) {                                              \
        printf("CUDA API failed at line %d with error: %s (%d)\n",             \
               __LINE__, cudaGetErrorString(cuda_rc), cuda_rc);                \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}
// Evaluates a cuSPARSE call exactly once. If the call does not return
// CUSPARSE_STATUS_SUCCESS, prints the source line together with the error
// string and numeric code, then returns EXIT_FAILURE from the enclosing
// function. Expands to a braced block, so call sites work with or without a
// trailing ';'.
#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    const cusparseStatus_t sparse_rc = (func);                                 \
    if (sparse_rc != CUSPARSE_STATUS_SUCCESS) {                                \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cusparseGetErrorString(sparse_rc), sparse_rc);        \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}
int main(void) {
// CUSPARSE APIs
cusparseHandle_t handle = NULL;
cusparseStatus_t status = (cusparseCreate(&handle));
if (status != CUSPARSE_STATUS_SUCCESS) {
printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, cusparseGetErrorString(status), status);
}
// Initialize matrix A
// this matrix is the same as https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/sparse2dense/sparse2dense_example.c
int num_rows = 5;
int num_cols = 4;
int nnz = 11;
int h_csr_offsets[] = { 0, 3, 4, 7, 9, 11 };
int h_csr_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3, 1, 2 };
float h_csr_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
7.0f, 8.0f, 9.0f, 10.0f, 11.0f };
// Device memory management
int* d_csr_offsets, * d_csr_columns;
float* d_csr_values;
CHECK_CUDA(cudaMalloc((void**)&d_csr_offsets, (num_rows + 1) * sizeof(int)))
CHECK_CUDA(cudaMalloc((void**)&d_csr_columns, nnz * sizeof(int)))
CHECK_CUDA(cudaMalloc((void**)&d_csr_values, nnz * sizeof(float)))
CHECK_CUDA(cudaMemcpy(d_csr_offsets, h_csr_offsets, (num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice))
CHECK_CUDA(cudaMemcpy(d_csr_columns, h_csr_columns, nnz * sizeof(int), cudaMemcpyHostToDevice))
CHECK_CUDA(cudaMemcpy(d_csr_values, h_csr_values, nnz * sizeof(float), cudaMemcpyHostToDevice))
// Memory allocation of transpose A
int* d_csr_offsets_AT, * d_csr_columns_AT;
float* d_csr_values_AT;
//first allocate memory to ATT
CHECK_CUDA(cudaMalloc((void**)&d_csr_offsets_AT, (num_cols + 1) * sizeof(int)))
CHECK_CUDA(cudaMalloc((void**)&d_csr_columns_AT, nnz * sizeof(int)))
CHECK_CUDA(cudaMalloc((void**)&d_csr_values_AT, nnz * sizeof(float)))
size_t buffer_temp_size;
cusparseCsr2cscEx2_bufferSize(
handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
d_csr_values_AT, d_csr_offsets_AT, d_csr_columns_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &buffer_temp_size);
void* buffer_temp = NULL;
printf("buffer_temp_size is %zd\n", buffer_temp_size);
CHECK_CUDA(cudaMalloc(&buffer_temp, buffer_temp_size))
CHECK_CUSPARSE(cusparseCsr2cscEx2(handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
d_csr_values_AT, d_csr_offsets_AT, d_csr_columns_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, buffer_temp))
}
The error is due to the fact that you are passing pointers to host data, to a routine that intends to work on device data:
cusparseCsr2cscEx2_bufferSize(
handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
                                 ^             ^              ^
and
CHECK_CUSPARSE(cusparseCsr2cscEx2(handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
                                                                   ^             ^              ^
When I change those instances to your allocated device data:
d_csr_values, d_csr_offsets, d_csr_columns
the "internal error" that you are asking about goes away, according to my testing.