While cublas<t>gemmBatched will compute the multiplications A1*B1, A2*B2, A3*B3, ..., AN*BN, I want every B to be the same matrix, and I do not want to spend memory on duplicate copies of B. My solution so far, shown below, required an extra host array double *d_B2[numBatch] just to transfer the batched pointers to double **d_BPtr via cudaMemcpy.
#include <cstdio>
#include <cstdlib>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define N 3
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
int main() {
    const int numBatch = 4;
    // Want to do A1*B, A2*B, A3*B, A4*B with batched CUBLAS
    // Host
    double h_A1[N*N], h_A2[N*N], h_A3[N*N], h_A4[N*N]; // matrices A1~A4
    h_A1[IDX2C(0,0,N)] = 0.5; h_A1[IDX2C(0,1,N)] = 3. ; h_A1[IDX2C(0,2,N)] = 4. ;
    h_A1[IDX2C(1,0,N)] = 1. ; h_A1[IDX2C(1,1,N)] = 3. ; h_A1[IDX2C(1,2,N)] = 10.;
    h_A1[IDX2C(2,0,N)] = 4. ; h_A1[IDX2C(2,1,N)] = 9. ; h_A1[IDX2C(2,2,N)] = 16.;
    h_A2[IDX2C(0,0,N)] = 0. ; h_A2[IDX2C(0,1,N)] = 3. ; h_A2[IDX2C(0,2,N)] = 4. ;
    h_A2[IDX2C(1,0,N)] = 1. ; h_A2[IDX2C(1,1,N)] = 3. ; h_A2[IDX2C(1,2,N)] = 10.;
    h_A2[IDX2C(2,0,N)] = 4. ; h_A2[IDX2C(2,1,N)] = 9. ; h_A2[IDX2C(2,2,N)] = 16.;
    h_A3[IDX2C(0,0,N)] = 0. ; h_A3[IDX2C(0,1,N)] = 1. ; h_A3[IDX2C(0,2,N)] = 4. ;
    h_A3[IDX2C(1,0,N)] = 3. ; h_A3[IDX2C(1,1,N)] = 3. ; h_A3[IDX2C(1,2,N)] = 9. ;
    h_A3[IDX2C(2,0,N)] = 4. ; h_A3[IDX2C(2,1,N)] = 10.; h_A3[IDX2C(2,2,N)] = 16.;
    h_A4[IDX2C(0,0,N)] = 0. ; h_A4[IDX2C(0,1,N)] = 3. ; h_A4[IDX2C(0,2,N)] = 4. ;
    h_A4[IDX2C(1,0,N)] = 1. ; h_A4[IDX2C(1,1,N)] = 5. ; h_A4[IDX2C(1,2,N)] = 6. ;
    h_A4[IDX2C(2,0,N)] = 9. ; h_A4[IDX2C(2,1,N)] = 8. ; h_A4[IDX2C(2,2,N)] = 2. ;
    double *h_APtr[numBatch]; // batch of pointers to the A matrices
    h_APtr[0] = &(h_A1[0]);
    h_APtr[1] = &(h_A2[0]);
    h_APtr[2] = &(h_A3[0]);
    h_APtr[3] = &(h_A4[0]);
    double h_B[N*N]; // matrix B, shared by every multiplication
    h_B[IDX2C(0,0,N)] = 1. ; h_B[IDX2C(0,1,N)] = 2. ; h_B[IDX2C(0,2,N)] = 3. ;
    h_B[IDX2C(1,0,N)] = 1. ; h_B[IDX2C(1,1,N)] = 2. ; h_B[IDX2C(1,2,N)] = 3. ;
    h_B[IDX2C(2,0,N)] = 1. ; h_B[IDX2C(2,1,N)] = 2. ; h_B[IDX2C(2,2,N)] = 3. ;
    double h_Sol1[N*N], h_Sol2[N*N], h_Sol3[N*N], h_Sol4[N*N]; // arrays to hold the results
    double *h_SolPtr[numBatch];
    h_SolPtr[0] = &(h_Sol1[0]);
    h_SolPtr[1] = &(h_Sol2[0]);
    h_SolPtr[2] = &(h_Sol3[0]);
    h_SolPtr[3] = &(h_Sol4[0]);
    // Device - relevant to A's
    double *d_A[numBatch], **d_APtr;
    for (int i=0; i<numBatch; i++) {
        cudaMalloc((void**)&d_A[i], N*N*sizeof(double));
        cudaMemcpy(d_A[i], h_APtr[i], N*N*sizeof(double), cudaMemcpyHostToDevice);
    }
    cudaMalloc((void**)&d_APtr, numBatch*sizeof(double *));
    cudaMemcpy(d_APtr, d_A, numBatch*sizeof(double *), cudaMemcpyHostToDevice);
    // Device - relevant to B
    double *d_B, **d_BPtr;
    cudaMalloc((void**)&d_B, N*N*sizeof(double));
    cudaMalloc((void**)&d_BPtr, numBatch*sizeof(double *));
    cudaMemcpy(d_B, h_B, N*N*sizeof(double), cudaMemcpyHostToDevice);
    double *d_B2[numBatch]; // extra host array I would like to get rid of
    for (int i=0; i<numBatch; i++) {
        d_B2[i] = d_B; // every batch entry points at the same B
    }
    cudaMemcpy(d_BPtr, d_B2, numBatch*sizeof(double *), cudaMemcpyHostToDevice);
    // Device - relevant to Sol's
    double *d_Sol[numBatch], **d_SolPtr;
    for (int i=0; i<numBatch; i++) {
        cudaMalloc((void**)&d_Sol[i], N*N*sizeof(double));
    }
    cudaMalloc((void**)&d_SolPtr, numBatch*sizeof(double *));
    cudaMemcpy(d_SolPtr, d_Sol, numBatch*sizeof(double *), cudaMemcpyHostToDevice);
    // CUBLAS
    cublasHandle_t myhandle;
    cublasStatus_t cublas_result;
    cublas_result = cublasCreate(&myhandle);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // gemmBatched: Sol_i = alpha * A_i * B_i + beta * Sol_i
    double alpha = 1.0;
    double beta = 0.0;
    cublas_result = cublasDgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N,
                                       N, N, N,
                                       &alpha, d_APtr, N, d_BPtr, N,
                                       &beta, d_SolPtr, N,
                                       numBatch);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // copy the results back to the host and print them next to the A's
    for (int i=0; i<numBatch; i++) {
        cudaMemcpy(h_SolPtr[i], d_Sol[i], N*N*sizeof(double), cudaMemcpyDeviceToHost);
    }
    double *tmpPtr, *tmpPtr2;
    for (int k=0; k<numBatch; k++) {
        tmpPtr = h_APtr[k];
        tmpPtr2 = h_SolPtr[k];
        for (int i=0; i<N; i++) {
            for (int j=0; j<N; j++) {
                printf("%f\t", tmpPtr[IDX2C(i,j,N)]);
            }
            for (int j=0; j<N; j++) {
                printf("%f\t", tmpPtr2[IDX2C(i,j,N)]);
            }
            printf("\n");
        }
        printf("\n");
    }
    // free
    for (int i=0; i<numBatch; i++) {
        cudaFree(d_A[i]);
        cudaFree(d_Sol[i]);
    }
    cudaFree(d_APtr);
    cudaFree(d_B);
    cudaFree(d_BPtr);
    cudaFree(d_SolPtr);
    cublasDestroy(myhandle);
    return 0;
}
I think this is ugly, and it should be possible to make it work without the extra d_B2[numBatch] array. What I have in mind is something like the code below, but it does not work this way.
double *d_B, **d_BPtr;
cudaMalloc((void**)&d_B, N*N*sizeof(double));
cudaMalloc((void**)&d_BPtr, numBatch*sizeof(double *));
cudaMemcpy(d_B, h_B, N*N*sizeof(double), cudaMemcpyHostToDevice);
for (int i=0; i<numBatch; i++) {
    cudaMemcpy((char *)d_BPtr + i*sizeof(double *), d_B, sizeof(double *), cudaMemcpyHostToDevice);
}
Is there a good way to use a duplicated matrix in CUBLAS batched operations without spending extra memory on the copies?
Your problem reduces to filling the entries of d_BPtr with the value of d_B. Something like this should work (completely untested):
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
...
double *d_B, **d_BPtr;
cudaMalloc((void**)&d_B, N*N*sizeof(double));
cudaMalloc((void**)&d_BPtr, numBatch*sizeof(double *));
cudaMemcpy(d_B, h_B, N*N*sizeof(double), cudaMemcpyHostToDevice);
// set all numBatch entries of d_BPtr to the pointer value d_B
thrust::fill(thrust::device, d_BPtr, d_BPtr+numBatch, d_B);
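Incidentally, the attempt in the question is very close to working without Thrust. The bug is that cudaMemcpy is handed d_B itself as the source, so it tries to read sizeof(double *) bytes from a host address equal to the device pointer's value, rather than copying the pointer value. Passing the host address of the pointer variable, &d_B, should do what you intended:

// copy the pointer *value* stored in the host variable d_B
// into each of the numBatch slots of the device array d_BPtr
for (int i=0; i<numBatch; i++) {
    cudaMemcpy(d_BPtr + i, &d_B, sizeof(double *), cudaMemcpyHostToDevice);
}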
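As a further alternative (my own suggestion, not something from the question): with CUDA 8.0 or newer, cublasDgemmStridedBatched sidesteps the pointer arrays altogether. Each operand is given as one base pointer plus a stride between consecutive matrices, and a stride of 0 for B makes every multiplication in the batch reuse the same matrix. It does require the A's and the results to sit in single contiguous allocations; a minimal sketch, where d_Apacked and d_Solpacked are hypothetical names for such buffers:

double *d_Apacked, *d_Solpacked;
cudaMalloc((void**)&d_Apacked, numBatch*N*N*sizeof(double));
cudaMalloc((void**)&d_Solpacked, numBatch*N*N*sizeof(double));
for (int i=0; i<numBatch; i++) {
    // pack A_i at offset i*N*N so consecutive A matrices are N*N elements apart
    cudaMemcpy(d_Apacked + i*N*N, h_APtr[i], N*N*sizeof(double), cudaMemcpyHostToDevice);
}
// strideB = 0: every batch entry reads the same B; no pointer arrays needed
cublasDgemmStridedBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N,
                          N, N, N,
                          &alpha, d_Apacked, N, (long long)N*N,
                          d_B, N, 0,
                          &beta, d_Solpacked, N, (long long)N*N,
                          numBatch);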