Tags: c++, cuda, cublas

CUBLAS matrix multiplication with row-major data without transpose


I am currently trying to implement matrix multiplication using CUBLAS on my GPU.

It works fine for square matrices and for certain input sizes, but for other sizes the last row of the result is never written (it contains 0, which is how the result vector is initialized).

I assume it is a problem with the allocation or with the syntax of cublasSgemm, but I could not find the mistake.

N.B.: If you are not familiar with CUBLAS: it is column-major, which is why the operations look like they are performed the other way around.
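
A minimal illustration of that storage convention (not part of the program):

// The 2x3 matrix {1 2 3 ; 4 5 6} laid out in memory:
// row-major    : 1 2 3 4 5 6   (rows are contiguous, the C/C++ convention)
// column-major : 1 4 2 5 3 6   (columns are contiguous, the CUBLAS convention)
// So a row-major buffer handed directly to CUBLAS is read as the transpose.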

Any help would be appreciated.


Source code for multiplication

Note that the gpuErrchk and cublasErrchk error-checking helpers are of course irrelevant to the problem here.

#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include <cstdint>
#include <vector>


extern void gpuAssert(cudaError_t code, const char *file, int line);
void cublasAssert(cublasStatus_t code, const char *file, int line);

// See below
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define cublasErrchk(ans) { cublasAssert((ans), __FILE__, __LINE__); }


std::vector<float> CUDA_mult_MAT(const std::vector<float> &data_1 , const uint64_t data_1_rows, const uint64_t data_1_columns,
                                 const std::vector<float> &data_2 , const uint64_t data_2_rows, const uint64_t data_2_columns)
{
    cublasHandle_t handle;

    cublasErrchk(cublasCreate(&handle));

    std::vector<float> result(data_1_rows * data_2_columns); //Vector holding the result of the multiplication

    /*----------------------------------------------------------------------------------------------*/

    float* GPU_data_1 = nullptr;
    gpuErrchk(cudaMalloc((void**)&GPU_data_1 , data_1.size()*sizeof(float))); //Allocate memory on the GPU
    gpuErrchk(cudaMemcpy(GPU_data_1, data_1.data(), data_1.size()*sizeof(float), cudaMemcpyHostToDevice)); //Copy data from data_1 to GPU_data_1

    float* GPU_data_2 = nullptr;
    gpuErrchk(cudaMalloc((void**)&GPU_data_2 ,data_2.size()*sizeof(float))); //Allocate memory on the GPU
    gpuErrchk(cudaMemcpy(GPU_data_2, data_2.data(), data_2.size()*sizeof(float), cudaMemcpyHostToDevice));//Copy data from data_2 to GPU_data_2

    float* GPU_result = nullptr;
    gpuErrchk(cudaMalloc((void**)&GPU_result , result.size()*sizeof(float))); //Allocate memory on the GPU

    /*----------------------------------------------------------------------------------------------*/


    const float alpha = 1.f; 
    const float beta  = 0.f;

    cublasErrchk(
               cublasSgemm(handle , CUBLAS_OP_N , CUBLAS_OP_N,
                           data_2_columns , data_2_rows ,data_1_columns,
                           &alpha , GPU_data_2 , data_2_columns,
                           GPU_data_1 , data_1_columns,
                           &beta , GPU_result , data_1_rows)
             ); //Perform multiplication 



    gpuErrchk(cudaMemcpy(result.data() , GPU_result , result.size() * sizeof(float) , cudaMemcpyDeviceToHost)); //Copy back to the vector 'result'

    gpuErrchk(cudaFree(GPU_data_1)); //Free GPU memory
    gpuErrchk(cudaFree(GPU_data_2)); //Free GPU memory
    gpuErrchk(cudaFree(GPU_result)); //Free GPU memory

    cublasErrchk(cublasDestroy_v2(handle)); 

    return result;
}


Test inputs

#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <vector>

int main()
{
    const auto r1 = CUDA_mult_MAT({1 , 2 , 3 , 4 , 5 , 6} , 2 , 3 ,
                                  {7 , 8 , 9 , 10 , 11 , 12} , 3 , 2);
    /*
    Product:
              7  8
    1 2 3  x  9  10
    4 5 6     11 12
    
    */

    for(const auto& value: r1){std::cout << value << " " ;}
    std::cout << std::endl;

    const auto r2 = CUDA_mult_MAT({7 , 8 , 9 , 10 , 11 , 12} , 3 , 2 ,
                                  {1 , 2 , 3 , 4 , 5 , 6} , 2 , 3);
    /*
    Product:
    7  8   
    9  10  x  1  2  3
    11 12     4  5  6
    */


    for(const auto& value: r2){std::cout << value << " " ;}
    std::cout << std::endl;

    return 0;
}

// Shamelessly stolen from https://stackoverflow.com/a/14038590
void gpuAssert(cudaError_t code, const char *file, int line)
{

    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

void cublasAssert(cublasStatus_t code, const char *file, int line)
{
    if(code != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr <<  "CUBLAS error.\nError code: ";

        switch(code)
        {
            case CUBLAS_STATUS_SUCCESS:{std::cerr << "CUBLAS_STATUS_SUCCESS."; break;}

            case CUBLAS_STATUS_NOT_INITIALIZED:{std::cerr << "CUBLAS_STATUS_NOT_INITIALIZED."; break;}

            case CUBLAS_STATUS_ALLOC_FAILED:{std::cerr << "CUBLAS_STATUS_ALLOC_FAILED."; break;}

            case CUBLAS_STATUS_INVALID_VALUE:{std::cerr << "CUBLAS_STATUS_INVALID_VALUE."; break;}

            case CUBLAS_STATUS_ARCH_MISMATCH:{std::cerr << "CUBLAS_STATUS_ARCH_MISMATCH."; break;}

            case CUBLAS_STATUS_MAPPING_ERROR:{std::cerr << "CUBLAS_STATUS_MAPPING_ERROR."; break;}

            case CUBLAS_STATUS_EXECUTION_FAILED:{std::cerr << "CUBLAS_STATUS_EXECUTION_FAILED."; break;}

            case CUBLAS_STATUS_INTERNAL_ERROR:{std::cerr << "CUBLAS_STATUS_INTERNAL_ERROR."; break;}

            case CUBLAS_STATUS_NOT_SUPPORTED:{std::cerr << "CUBLAS_STATUS_NOT_SUPPORTED."; break;}

            case CUBLAS_STATUS_LICENSE_ERROR:{std::cerr << "CUBLAS_STATUS_LICENSE_ERROR."; break;}

            default:{std::cerr << "<unknown>."; break;}
        }

        std::cerr << "\nFile: "<< file << "\n";
        std::cerr << "Line: "<< line <<std::endl;

        exit(EXIT_FAILURE);
    }
}

Outputs

Printed by the program:

58 64 139 154 
39 54 69 49 68 87 0 0 0
                  ^~~~~~~

Expected:

58 64 139 154 
39 54 69 49 68 87 59 82 105
                  ^~~~~~~

Solution

  • We can observe the problem with your CUBLAS usage in several ways.

    First, studying the CUBLAS Sgemm documentation, we see that the three parameters m, n, k appear, in that order, immediately after the transpose specifiers:

    cublasStatus_t cublasSgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k, 
                               ^      ^      ^
    

    We also observe that the matrix dimensions are given by:

    A, B and C are matrices stored in column-major format with dimensions op(A) m × k, op(B) k × n and C m × n.

    So the first input matrix is of dimension m x k, the second input matrix is of dimension k x n, and the output matrix is of dimension m x n.

    Let's just focus on the output matrix for a moment. Given that its dimensions are specified by the m and n parameters, it cannot possibly be correct (in the non-square case, at least) to pass only data_2 dimensions for those (see the worked check after the snippet below):

               cublasSgemm(handle , CUBLAS_OP_N , CUBLAS_OP_N,
                           data_2_columns , data_2_rows ,data_1_columns,
                           ^^^^^^^^^^^^^^   ^^^^^^^^^^^
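
    Concretely, plugging the second test case from main() into that call shows the mismatch (a worked check using the dimensions above):

        // Second test: data_1 is 3 x 2, data_2 is 2 x 3, so the product is 3 x 3.
        // Required        : m = 3, n = 3, k = 2
        // Actually passed : m = data_2_columns = 3, n = data_2_rows = 2, k = data_1_columns = 2
        // CUBLAS therefore writes only m * n = 6 values into the 9-element result buffer,
        // which is exactly why the last 3 entries of the second output line stay 0.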
    

    Second, from the standpoint of error checking, you can quickly get an indication that something is wrong with your CUBLAS call by running your code with cuda-memcheck (or, on newer GPUs, compute-sanitizer). The very first error reported is as follows:

    $ cuda-memcheck ./t23
    ========= CUDA-MEMCHECK
    ========= Invalid __global__ read of size 4
    =========     at 0x000006f0 in void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=3, int=4, bool=0, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>>(cublasGemmSmallNParams<float const , cublasGemvTensorStridedBatched<float const >, float>)
    =========     by thread (64,0,0) in block (0,0,0)
    =========     Address 0x7f9c30a2061c is out of bounds
    =========     Device Frame:void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=3, int=4, bool=0, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>>(cublasGemmSmallNParams<float const , cublasGemvTensorStridedBatched<float const >, float>) (void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=3, int=4, bool=0, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>>(cublasGemmSmallNParams<float const , cublasGemvTensorStridedBatched<float const >, float>) : 0x6f0)
    =========     Saved host backtrace up to driver entry point at kernel launch time
    =========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2b8) [0x1e5cc8]
    =========     Host Frame:/usr/local/cuda/lib64/libcublasLt.so.11 [0x1063c8b]
    =========     Host Frame:/usr/local/cuda/lib64/libcublasLt.so.11 [0x10a9965]
    =========     Host Frame:/usr/local/cuda/lib64/libcublasLt.so.11 [0x6bfacc]
    =========     Host Frame:/usr/local/cuda/lib64/libcublasLt.so.11 [0x5fc7af]
    =========     Host Frame:/usr/local/cuda/lib64/libcublasLt.so.11 [0x436c35]
    =========     Host Frame:/usr/local/cuda/lib64/libcublasLt.so.11 (cublasLtMatmul + 0x60f) [0x43484f]
    =========     Host Frame:/usr/local/cuda/lib64/libcublas.so.11 [0x9ef6db]
    =========     Host Frame:/usr/local/cuda/lib64/libcublas.so.11 [0x50e4f0]
    =========     Host Frame:/usr/local/cuda/lib64/libcublas.so.11 (cublasSgemm_v2 + 0x1ee) [0x50f29e]
    =========     Host Frame:./t23 [0x7986]
    =========     Host Frame:./t23 [0x7b4c]
    =========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
    =========     Host Frame:./t23 [0x744a]
    =========
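
    On recent CUDA toolkits, where cuda-memcheck has been removed, the same check would be run with its replacement tool:

        $ compute-sanitizer ./t23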
    

    Of course, one possible solution would be to transpose the input matrices so that they are in column-major order, and CUBLAS provides options in Sgemm to do that (see above). However, it appears to me that what you are trying to do is C-style row-major multiplication without transposing the input arrays. There is an article here which describes how to do that.
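
    The underlying trick is short enough to state directly. For row-major A (M x K), B (K x N) and C = A * B (M x N), the column-major views of those same buffers are A', B' and C'. Since C' = (A * B)' = B' * A', you can ask CUBLAS for exactly that product with no transpose flags at all (d_A, d_B, d_C are placeholder device pointers):

        // Row-major C = A * B, requested as the column-major product C' = B' * A'.
        // Operands are passed in swapped order, m and n are swapped, and each
        // leading dimension is the row length of the row-major matrix.
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                    N, M, K,            // m = N, n = M, k = K
                    &alpha, d_B, N,     // B buffer, viewed as N x K
                    d_A, K,             // A buffer, viewed as K x M
                    &beta, d_C, N);     // C buffer, viewed as N x M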

    When I apply that heuristic to your cublasSgemm() call, I get this:

               cublasSgemm(handle , CUBLAS_OP_N , CUBLAS_OP_N,
                           data_2_columns , data_1_rows ,data_1_columns,
                           &alpha , GPU_data_2 , data_2_columns,
                           GPU_data_1 , data_1_columns,
                           &beta , GPU_result , data_2_columns)
    

    When I compile and run your code with those changes, I get this:

    $ cuda-memcheck ./t23
    ========= CUDA-MEMCHECK
    58 64 139 154
    39 54 69 49 68 87 59 82 105
    ========= ERROR SUMMARY: 0 errors
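
    If you need this in more than one place, the argument mapping can be wrapped once and reused. A minimal sketch (the helper name row_major_sgemm is mine, not part of CUBLAS):

        // Hypothetical helper: row-major C (M x N) = A (M x K) * B (K x N),
        // implemented as the column-major product C' = B' * A'.
        void row_major_sgemm(cublasHandle_t handle, int M, int N, int K,
                             const float *d_A, const float *d_B, float *d_C)
        {
            const float alpha = 1.f;
            const float beta  = 0.f;
            cublasErrchk(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                     N, M, K,
                                     &alpha, d_B, N,
                                     d_A, K,
                                     &beta, d_C, N));
        }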
    

    This post indicates how the above can be modified to handle the case of C = A*B' instead of C = A*B.
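
    A hedged sketch of that variant, derived from the same transpose identity: with row-major A (M x K), row-major B (N x K) and C = A * B' (M x N), the column-major view of the B buffer is already B', so CUBLAS has to transpose it back (d_A, d_B, d_C are again placeholder device pointers):

        // Row-major C = A * B', requested as the column-major product C' = B * A'.
        // The column-major view of the row-major B buffer is B' (K x N),
        // so CUBLAS_OP_T on the first operand recovers B itself.
        cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                    N, M, K,
                    &alpha, d_B, K,   // B buffer, viewed K x N, re-transposed to N x K
                    d_A, K,           // A buffer, viewed as K x M
                    &beta, d_C, N);   // C buffer, viewed as N x M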

    Here is a pictorial copy of the previously linked web page:

    [Image: diagram from the linked article illustrating the row-major GEMM argument mapping]