While cublas<t>gemmBatched will compute the multiplications A1*B1, A2*B2, A3*B3, ..., AN*BN, I want every B to be the same matrix, and I do not want to spend memory on duplicate copies of B. My solution so far, shown below, required an extra host array double *d_B2[numBatch] just to transfer the batched pointers to double **d_BPtr via cudaMemcpy.
#include <cstdio>
#include <cstdlib>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define N 3
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
int main() {
    const int numBatch = 4;
    // Want to do A1*B, A2*B, A3*B, A4*B with batched CUBLAS
    // Host
    double h_A1[N*N], h_A2[N*N], h_A3[N*N], h_A4[N*N]; // matrices A1~A4
    h_A1[IDX2C(0,0,N)] = 0.5; h_A1[IDX2C(0,1,N)] = 3. ; h_A1[IDX2C(0,2,N)] = 4. ;
    h_A1[IDX2C(1,0,N)] = 1. ; h_A1[IDX2C(1,1,N)] = 3. ; h_A1[IDX2C(1,2,N)] = 10.;
    h_A1[IDX2C(2,0,N)] = 4. ; h_A1[IDX2C(2,1,N)] = 9. ; h_A1[IDX2C(2,2,N)] = 16.;
    h_A2[IDX2C(0,0,N)] = 0. ; h_A2[IDX2C(0,1,N)] = 3. ; h_A2[IDX2C(0,2,N)] = 4. ;
    h_A2[IDX2C(1,0,N)] = 1. ; h_A2[IDX2C(1,1,N)] = 3. ; h_A2[IDX2C(1,2,N)] = 10.;
    h_A2[IDX2C(2,0,N)] = 4. ; h_A2[IDX2C(2,1,N)] = 9. ; h_A2[IDX2C(2,2,N)] = 16.;
    h_A3[IDX2C(0,0,N)] = 0. ; h_A3[IDX2C(0,1,N)] = 1. ; h_A3[IDX2C(0,2,N)] = 4. ;
    h_A3[IDX2C(1,0,N)] = 3. ; h_A3[IDX2C(1,1,N)] = 3. ; h_A3[IDX2C(1,2,N)] = 9. ;
    h_A3[IDX2C(2,0,N)] = 4. ; h_A3[IDX2C(2,1,N)] = 10.; h_A3[IDX2C(2,2,N)] = 16.;
    h_A4[IDX2C(0,0,N)] = 0. ; h_A4[IDX2C(0,1,N)] = 3. ; h_A4[IDX2C(0,2,N)] = 4. ;
    h_A4[IDX2C(1,0,N)] = 1. ; h_A4[IDX2C(1,1,N)] = 5. ; h_A4[IDX2C(1,2,N)] = 6. ;
    h_A4[IDX2C(2,0,N)] = 9. ; h_A4[IDX2C(2,1,N)] = 8. ; h_A4[IDX2C(2,2,N)] = 2. ;
    double *h_APtr[numBatch]; // batch of pointers to the A matrices
    h_APtr[0] = &(h_A1[0]);
    h_APtr[1] = &(h_A2[0]);
    h_APtr[2] = &(h_A3[0]);
    h_APtr[3] = &(h_A4[0]);
    double h_B[N*N]; // matrix B, shared by every multiplication
    h_B[IDX2C(0,0,N)] = 1. ; h_B[IDX2C(0,1,N)] = 2. ; h_B[IDX2C(0,2,N)] = 3. ;
    h_B[IDX2C(1,0,N)] = 1. ; h_B[IDX2C(1,1,N)] = 2. ; h_B[IDX2C(1,2,N)] = 3. ;
    h_B[IDX2C(2,0,N)] = 1. ; h_B[IDX2C(2,1,N)] = 2. ; h_B[IDX2C(2,2,N)] = 3. ;
    double h_Sol1[N*N], h_Sol2[N*N], h_Sol3[N*N], h_Sol4[N*N]; // arrays to hold the results
    double *h_SolPtr[numBatch];
    h_SolPtr[0] = &(h_Sol1[0]);
    h_SolPtr[1] = &(h_Sol2[0]);
    h_SolPtr[2] = &(h_Sol3[0]);
    h_SolPtr[3] = &(h_Sol4[0]);
    // Device - relevant to A's
    double *d_A[numBatch], **d_APtr;
    for (int i=0; i<numBatch; i++) {
        cudaMalloc((void**)&d_A[i], N*N*sizeof(double));
        cudaMemcpy(d_A[i], h_APtr[i], N*N*sizeof(double), cudaMemcpyHostToDevice);
    }
    cudaMalloc((void**)&d_APtr, numBatch*sizeof(double *));
    cudaMemcpy(d_APtr, d_A, numBatch*sizeof(double *), cudaMemcpyHostToDevice);
    // Device - relevant to B
    double *d_B, **d_BPtr;
    cudaMalloc((void**)&d_B, N*N*sizeof(double));
    cudaMalloc((void**)&d_BPtr, numBatch*sizeof(double *));
    cudaMemcpy(d_B, h_B, N*N*sizeof(double), cudaMemcpyHostToDevice);
    double *d_B2[numBatch]; // extra host array I would like to get rid of
    for (int i=0; i<numBatch; i++) {
        d_B2[i] = d_B; // every batch entry points at the same B
    }
    cudaMemcpy(d_BPtr, d_B2, numBatch*sizeof(double *), cudaMemcpyHostToDevice);
    // Device - relevant to Sol's
    double *d_Sol[numBatch], **d_SolPtr;
    for (int i=0; i<numBatch; i++) {
        cudaMalloc((void**)&d_Sol[i], N*N*sizeof(double));
    }
    cudaMalloc((void**)&d_SolPtr, numBatch*sizeof(double *));
    cudaMemcpy(d_SolPtr, d_Sol, numBatch*sizeof(double *), cudaMemcpyHostToDevice);
    // CUBLAS
    cublasHandle_t myhandle;
    cublasStatus_t cublas_result;
    cublas_result = cublasCreate(&myhandle);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // gemmBatched: Sol_i = alpha * A_i * B_i + beta * Sol_i
    double alpha = 1.0;
    double beta = 0.0;
    cublas_result = cublasDgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N,
                                       N, N, N,
                                       &alpha, d_APtr, N, d_BPtr, N,
                                       &beta, d_SolPtr, N,
                                       numBatch);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // copy the results back to the host and print them next to the A's
    for (int i=0; i<numBatch; i++) {
        cudaMemcpy(h_SolPtr[i], d_Sol[i], N*N*sizeof(double), cudaMemcpyDeviceToHost);
    }
    double *tmpPtr, *tmpPtr2;
    for (int k=0; k<numBatch; k++) {
        tmpPtr = h_APtr[k];
        tmpPtr2 = h_SolPtr[k];
        for (int i=0; i<N; i++) {
            for (int j=0; j<N; j++) {
                printf("%f\t", tmpPtr[IDX2C(i,j,N)]);
            }
            for (int j=0; j<N; j++) {
                printf("%f\t", tmpPtr2[IDX2C(i,j,N)]);
            }
            printf("\n");
        }
        printf("\n");
    }
    // free
    for (int i=0; i<numBatch; i++) {
        cudaFree(d_A[i]);
        cudaFree(d_Sol[i]);
    }
    cudaFree(d_APtr);
    cudaFree(d_B);
    cudaFree(d_BPtr);
    cudaFree(d_SolPtr);
    cublasDestroy(myhandle);
    return 0;
}
I think this is ugly, and it should be possible to make it work without the extra d_B2[numBatch] array. What I have in mind is something like the code below, but it does not work this way.
double *d_B, **d_BPtr;
cudaMalloc((void**)&d_B, N*N*sizeof(double));
cudaMalloc((void**)&d_BPtr, numBatch*sizeof(double *));
cudaMemcpy(d_B, h_B, N*N*sizeof(double), cudaMemcpyHostToDevice);
for (int i=0; i<numBatch; i++) {
    cudaMemcpy((char *)d_BPtr + i*sizeof(double *), d_B, sizeof(double *), cudaMemcpyHostToDevice);
}
Is there a good way to use a duplicated matrix in CUBLAS batched operations without spending extra memory on the copies?
Your problem reduces to filling the entries of d_BPtr with the value of d_B. Something like this should work (completely untested):
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
...
double *d_B, **d_BPtr;
cudaMalloc((void**)&d_B, N*N*sizeof(double));
cudaMalloc((void**)&d_BPtr, numBatch*sizeof(double *));
cudaMemcpy(d_B, h_B, N*N*sizeof(double), cudaMemcpyHostToDevice);
// set all numBatch entries of d_BPtr to the pointer value d_B
thrust::fill(thrust::device, d_BPtr, d_BPtr+numBatch, d_B);
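Incidentally, the attempt in the question is very close to working without Thrust. The bug is that cudaMemcpy is handed d_B itself as the source, so it tries to read sizeof(double *) bytes from a host address equal to the device pointer's value, rather than copying the pointer value. Passing the host address of the pointer variable, &d_B, should do what you intended:

// copy the pointer *value* stored in the host variable d_B
// into each of the numBatch slots of the device array d_BPtr
for (int i=0; i<numBatch; i++) {
    cudaMemcpy(d_BPtr + i, &d_B, sizeof(double *), cudaMemcpyHostToDevice);
}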
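As a further alternative (my own suggestion, not something from the question): with CUDA 8.0 or newer, cublasDgemmStridedBatched sidesteps the pointer arrays altogether. Each operand is given as one base pointer plus a stride between consecutive matrices, and a stride of 0 for B makes every multiplication in the batch reuse the same matrix. It does require the A's and the results to sit in single contiguous allocations; a minimal sketch, where d_Apacked and d_Solpacked are hypothetical names for such buffers:

double *d_Apacked, *d_Solpacked;
cudaMalloc((void**)&d_Apacked, numBatch*N*N*sizeof(double));
cudaMalloc((void**)&d_Solpacked, numBatch*N*N*sizeof(double));
for (int i=0; i<numBatch; i++) {
    // pack A_i at offset i*N*N so consecutive A matrices are N*N elements apart
    cudaMemcpy(d_Apacked + i*N*N, h_APtr[i], N*N*sizeof(double), cudaMemcpyHostToDevice);
}
// strideB = 0: every batch entry reads the same B; no pointer arrays needed
cublasDgemmStridedBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N,
                          N, N, N,
                          &alpha, d_Apacked, N, (long long)N*N,
                          d_B, N, 0,
                          &beta, d_Solpacked, N, (long long)N*N,
                          numBatch);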