c++ · cuda · gpu · matrix-multiplication · cublas

CUBLAS_STATUS_INVALID_VALUE


I am doing a so-called "tensor contraction" using the cublasSgemmStridedBatched API. I have tensor A of shape 60000*20*9 and tensor B of shape 9*32, both of them row-major. By definition, C = A * B should give me the result tensor C of shape 60000*20*32. The code I wrote is the following:


    int batch_count = 60000;
    int M = 20;
    int K = 9;
    int N = 32;

    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 1.0;
    float beta = 0.0;
    int strideA = 20 * 9;
    int strideB = 0;
    int strideC = 20 * 32;
    // A(60000 * 20 * 9) * B(9 * 32) = C(60000 * 20 * 32)
    cublasStatus_t ret = cublasSgemmStridedBatched(
                              handle, 
                              CUBLAS_OP_N, //transposed, since in row-major
                              CUBLAS_OP_N, //transposed, since in row-major
                              N,
                              M,
                              K,
                              &alpha,
                              B.data<float>(), //already in GPU 
                              N, // lda, transposed
                              strideB, 
                              A.data<float>(), //already in GPU 
                              K, // ldb, transposed
                              strideA, 
                              &beta, 
                              C.data<float>(),//already in GPU 
                              N, // ldc
                              strideC, 
                              batch_count);
    cublasDestroy(handle);
    if(ret != CUBLAS_STATUS_SUCCESS){
      printf("cublasSgemmStridedBatched failed %d line (%d)\n", ret, __LINE__);
    }

The above code does not work and keeps printing cublasSgemmStridedBatched failed 7, which according to the manual corresponds to CUBLAS_STATUS_INVALID_VALUE. Any help or suggestion is appreciated!


Solution

  • Here is a minimal version that works and tests the result:

    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    #include <cstdio>
    #include <Eigen/Dense>
    
    int main()
    {
      cublasHandle_t cubl;
      cublasCreate(&cubl);
      int batch_count = 60000;
      int M = 20;
      int K = 9;
      int N = 32;
      float* A, *B, *C;
      cudaMallocManaged(&A, sizeof(float) * batch_count * M * K);
      cudaMallocManaged(&B, sizeof(float) * K * N);
      cudaMallocManaged(&C, sizeof(float) * batch_count * M * N);
      for(int b = 0; b < batch_count; ++b)
        for(int m = 0; m < M; ++m)
          for(int k = 0; k < K; ++k)
            A[((b * M) + m) * K + k] = (float)(b + 1) * (m + 2) * (k + 3) / (M*N*K);
      for(int k = 0; k < K; ++k)
        for(int n = 0; n < N; ++n)
          B[k * N + n] = (float) (k + 1) * (n + 2) / (N*K);
      const float alpha = 1.f, beta = 0.f;
      const int strideA = K * M, strideB = 0, strideC = M * N;
      cublasStatus_t ret = cublasSgemmStridedBatched(
        cubl, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha,
        B, N /*lda*/, strideB,
        A, K /*ldb*/, strideA, &beta,
        C, N /*ldc*/, strideC, batch_count);
      if(ret != CUBLAS_STATUS_SUCCESS)
        std::printf("cublasSgemmStridedBatched failed %d line (%d)\n",
                    ret, __LINE__);
      cudaError_t curet = cudaDeviceSynchronize();
      std::printf("Device sync: %d\n", curet);
      // Verify against Eigen. A row-major matrix occupies the same memory as its
      // transpose read column-major, so B_map is N x K, A_map is K x M, and
      // B_map * A_map is the N x M column-major view of each row-major C slice
      Eigen::ArrayXXf reference, error = Eigen::ArrayXXf::Zero(N, M);
      const auto B_map = Eigen::MatrixXf::Map(B, N, K);
      for(int b = 0; b < batch_count; ++b) {
        const auto A_map = Eigen::MatrixXf::Map(A + strideA * b, K, M);
        reference.matrix().noalias() = B_map * A_map;
        const auto C_map = Eigen::ArrayXXf::Map(C + strideC * b, N, M);
        const auto rel_error = (C_map - reference).abs() /
              C_map.abs().max(reference.abs());
        error = error.max(rel_error);
      }
      std::printf("Max relative error %g\n", error.maxCoeff());
    }
    

    Reports a maximum relative error of 2.5e-7
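
  • For reference, the operand swap works because cuBLAS assumes column-major storage: a row-major M x K matrix A occupies exactly the same memory as the column-major K x M matrix A^T. Using the identity (A * B)^T = B^T * A^T, the call computes

        C^T (N x M) = B^T (N x K) * A^T (K x M)

    entirely with CUBLAS_OP_N, and the resulting column-major C^T is byte-for-byte the desired row-major C of shape M x N. That is why B is passed as the first operand, A as the second, and M and N are exchanged relative to the row-major shapes.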