Tags: c++ · cuda · cublas

CUBLAS matrix multiplication with row-major data


I read some related posts here, and succeeded in doing row-major matrix multiplication with cuBLAS:

A*B (column-major) = B*A (row-major)

I wrote a wrapper to do this so that I can pass row-major matrices A and B and get the result back accordingly.

// Row-major GEMM wrapper: computes out = alpha * in1 * in2 + beta * out for
// row-major in1 (row1 x col1) and in2 (row2 x col2), using the identity
//   row-major(A*B) == column-major(B) * column-major(A):
// the operands are passed to column-major cuBLAS in swapped order, each with
// its row-major column count as the leading dimension.
// Note: row2 is not used in the computation.
// NOTE(review): trans1/trans2 are forwarded verbatim, but the dimension and
// leading-dimension arguments below are laid out for the non-transposed
// (CUBLAS_OP_N / CUBLAS_OP_N) case — confirm before passing CUBLAS_OP_T.
// Returns the cublasSgemm status.
cublasStatus_t gemm_float(cuDataF &out, cublasHandle_t &handle, const cuDataF &in1, int row1, int col1, cublasOperation_t trans1, const cuDataF &in2, int row2, int col2, cublasOperation_t trans2, float alpha, float beta) {
        /*if (trans1 == CUBLAS_OP_N && trans2 == CUBLAS_OP_T) {
            
        }*/

        // Swapped order: in2 plays the role of cuBLAS "A", in1 of "B".
        // m = col2, n = row1, k = col1.
        return cublasSgemm(handle, trans1, trans2,
                            col2, row1, col1,
                            &alpha,
                            in2.data(), col2,
                            in1.data(), col1,
                            &beta,
                            out.data(), col2);
  }

But now my question is: if I want A*transpose(B) in row-major form, how do I do this?

I tried to derive the mathematics by hand, but the answer came out wrong.

Like:

A(2x3) = {1,2,3,4,5,6} (row majored) 
B(2x3) = {7,8,9,10,11,12} (row majored)
A*transpose(B)?

Solution

  • I'll use the code here as a starting point, since you mentioned that you had already observed that, and the order of arguments in your proposed cublas call lines up with that. What changes are needed?

    So your A matrix and B matrix, interpreted as 2x3, are:

      A             B
    1 2 3         7  8  9
    4 5 6        10  11 12
    

    Therefore transpose(B)=B' is:

      B'
    7  10
    8  11
    9  12
    

    And the product C=AxB' is:

       C
    50  68
    122 167
    

    Likewise for a similar 3x2 test case, we get:

      C          =     A   x    B'
    23  53  83        7  8     1  3  5
    29  67  105       9  10    2  4  6
    35  81  127       11 12
    

    This seems to work:

    # cat t232.cu
    #include <cuda.h>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    #include <iostream>
    #include <vector>
    
    
    // Error-checking helpers; definitions are at the bottom of this listing.
    extern void gpuAssert(cudaError_t code, const char *file, int line);
    void cublasAssert(cublasStatus_t code, const char *file, int line);
    
    // Wrap every CUDA / cuBLAS call so failures report the file and line.
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    #define cublasErrchk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
    
    
    // Multiply two row-major matrices on the GPU and return the row-major
    // result of size data_1_rows x data_2_columns.
    //
    // cuBLAS works in column-major order, so the row-major inputs are handed
    // to cublasSgemm in swapped order (data_2 first) and that first operand is
    // marked CUBLAS_OP_T; as shown by the sample runs below, this yields the
    // row-major A * transpose(B) product asked about in the question.
    //
    // Parameters:
    //   data_1, data_1_rows, data_1_columns : left input matrix (row-major)
    //   data_2, data_2_rows, data_2_columns : second input matrix; its
    //       dimensions describe the operand as it enters the calculation
    // Returns: host vector holding the row-major product.
    // On any CUDA/cuBLAS failure the *Errchk macros print a message and exit().
    std::vector<float> CUDA_mult_MAT_T(const std::vector<float> &data_1 , const uint64_t data_1_rows, const uint64_t data_1_columns,
                                     const std::vector<float> &data_2 , const uint64_t data_2_rows, const uint64_t data_2_columns)
    {
        cublasHandle_t handle;

        cublasErrchk(cublasCreate(&handle));

        std::vector<float> result(data_1_rows * data_2_columns); //Vector holding the result of the multiplication

        /*----------------------------------------------------------------------------------------------*/

        float* GPU_data_1 = nullptr;
        gpuErrchk(cudaMalloc((void**)&GPU_data_1 , data_1.size()*sizeof(float))); //Allocate memory on the GPU
        gpuErrchk(cudaMemcpy(GPU_data_1, data_1.data(), data_1.size()*sizeof(float), cudaMemcpyHostToDevice)); //Copy data from data_1 to GPU_data_1

        float* GPU_data_2 = nullptr;
        gpuErrchk(cudaMalloc((void**)&GPU_data_2 ,data_2.size()*sizeof(float))); //Allocate memory on the GPU
        gpuErrchk(cudaMemcpy(GPU_data_2, data_2.data(), data_2.size()*sizeof(float), cudaMemcpyHostToDevice));//Copy data from data_2 to GPU_data_2

        float* GPU_result = nullptr;
        gpuErrchk(cudaMalloc((void**)&GPU_result , result.size()*sizeof(float))); //Allocate memory on the GPU

        /*----------------------------------------------------------------------------------------------*/


        const float alpha = 1.f;
        const float beta  = 0.f;

        // m = data_2_columns, n = data_1_rows, k = data_1_columns.
        // CUBLAS_OP_T on the first operand together with its leading
        // dimension data_2_rows is what realizes the transpose; the second
        // operand keeps its row-major column count as its leading dimension.
        cublasErrchk(
                   cublasSgemm(handle , CUBLAS_OP_T, CUBLAS_OP_N,
                           data_2_columns , data_1_rows ,data_1_columns,
                           &alpha , GPU_data_2 , data_2_rows,
                           GPU_data_1 , data_1_columns,
                           &beta , GPU_result , data_2_columns)

                   ); //Perform multiplication



        gpuErrchk(cudaMemcpy(result.data() , GPU_result , result.size() * sizeof(float) , cudaMemcpyDeviceToHost)); //Copy back to the vector 'result'

        gpuErrchk(cudaFree(GPU_data_1)); //Free GPU memory
        gpuErrchk(cudaFree(GPU_data_2)); //Free GPU memory
        gpuErrchk(cudaFree(GPU_result)); //Free GPU memory

        cublasErrchk(cublasDestroy_v2(handle));

        return result;
    }
    // Exercise CUDA_mult_MAT_T with the two worked examples from the answer.
    int main()
    {
        // Row-major 2x3 {1..6} combined with {7..12} supplied as 3x2:
        //           7  8
        // 1 2 3  x  9  10
        // 4 5 6     11 12
        const std::vector<float> lhs1{1, 2, 3, 4, 5, 6};
        const std::vector<float> rhs1{7, 8, 9, 10, 11, 12};
        const auto r1 = CUDA_mult_MAT_T(lhs1, 2, 3, rhs1, 3, 2);

        for (const float v : r1) {
            std::cout << v << " ";
        }
        std::cout << std::endl;

        // Row-major 3x2 {7..12} combined with {1..6} supplied as 2x3:
        // 7  8
        // 9  10  x  1  2  3
        // 11 12     4  5  6
        const std::vector<float> lhs2{7, 8, 9, 10, 11, 12};
        const std::vector<float> rhs2{1, 2, 3, 4, 5, 6};
        const auto r2 = CUDA_mult_MAT_T(lhs2, 3, 2, rhs2, 2, 3);

        for (const float v : r2) {
            std::cout << v << " ";
        }
        std::cout << std::endl;

        return 0;
    }
    
    // Shamelessly stolen from https://stackoverflow.com/a/14038590
    // Print a descriptive message and terminate when a CUDA API call fails.
    // Shamelessly stolen from https://stackoverflow.com/a/14038590
    void gpuAssert(cudaError_t code, const char *file, int line)
    {
        if (code == cudaSuccess)
        {
            return; // nothing to report
        }

        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
    
    // Print a readable cuBLAS status name plus file/line and terminate when a
    // cuBLAS call fails; does nothing on CUBLAS_STATUS_SUCCESS.
    void cublasAssert(cublasStatus_t code, const char *file, int line)
    {
        if (code == CUBLAS_STATUS_SUCCESS)
        {
            return;
        }

        // Translate the status enum into its printable name first, then
        // emit the whole report in one place.
        const char *name = "<unknown>.";
        switch (code)
        {
            case CUBLAS_STATUS_SUCCESS:          name = "CUBLAS_STATUS_SUCCESS."; break;
            case CUBLAS_STATUS_NOT_INITIALIZED:  name = "CUBLAS_STATUS_NOT_INITIALIZED."; break;
            case CUBLAS_STATUS_ALLOC_FAILED:     name = "CUBLAS_STATUS_ALLOC_FAILED."; break;
            case CUBLAS_STATUS_INVALID_VALUE:    name = "CUBLAS_STATUS_INVALID_VALUE."; break;
            case CUBLAS_STATUS_ARCH_MISMATCH:    name = "CUBLAS_STATUS_ARCH_MISMATCH."; break;
            case CUBLAS_STATUS_MAPPING_ERROR:    name = "CUBLAS_STATUS_MAPPING_ERROR."; break;
            case CUBLAS_STATUS_EXECUTION_FAILED: name = "CUBLAS_STATUS_EXECUTION_FAILED."; break;
            case CUBLAS_STATUS_INTERNAL_ERROR:   name = "CUBLAS_STATUS_INTERNAL_ERROR."; break;
            case CUBLAS_STATUS_NOT_SUPPORTED:    name = "CUBLAS_STATUS_NOT_SUPPORTED."; break;
            case CUBLAS_STATUS_LICENSE_ERROR:    name = "CUBLAS_STATUS_LICENSE_ERROR."; break;
            default: break;
        }

        std::cerr << "CUBLAS error.\nError code: " << name;
        std::cerr << "\nFile: " << file << "\n";
        std::cerr << "Line: " << line << std::endl;

        exit(EXIT_FAILURE);
    }
    # nvcc -o t232 t232.cu -lcublas
    # compute-sanitizer ./t232
    ========= COMPUTE-SANITIZER
    50 68 122 167
    23 53 83 29 67 105 35 81 127
    ========= ERROR SUMMARY: 0 errors
    #
    

    Note that I left the "input" description of both matrices to be consistent with the way they will actually be calculated. In the calculation routine, the only changes I made were to change the first transpose specifier from "OP_N" to "OP_T" (because the 2nd input matrix shows up as the first calculation matrix in this row-major rubric/formulation), and to change the leading dimension of the first calculation matrix (the 2nd input matrix — the one that gets transposed) from data_2_columns to data_2_rows.

    That is probably the fewest changes from the prototype given in the question. However, we could dispense with all that and recognize that transpose(B), when B is row-major, is equivalent to viewing B as column-major. Likewise, if A is row-major, then transpose(A) makes it equivalent to column-major. Therefore we could use an "ordinary" CUBLAS formulation, but specify OP_T on A (and use B as-is, without reversing the order of B and A as indicated in the question).