I'm trying to do a matrix-matrix multiplication using Cublas but it still not work and I don't figure out the problem. Since it is the first time I use Cublas I'm not sure that I set the right parameter, especially for the leading dimension
For example:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
void mulWithCuda(double *c, const double *a, const double *b, unsigned int size);
int main(){
const int arraySize = 9;
const double a[12] = { 1, 2, 3, 4, 5, 6, 7, 8 ,9, 10, 11, 12 };
const double b[arraySize] = { 10, 20, 30, 40, 50, 60, 70, 80, 90 };
double c[12] = { 0 };
mulWithCuda(c, a, b, arraySize);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 3; j++) {
printf("%lf ", c[i * 3 + j]);
}
printf("\n");
}
return 0;
}
void mulWithCuda(double* c, const double* a, const double* b, unsigned int size){
double *dev_a = 0;
double *dev_b = 0;
double *dev_c = 0;
cudaMalloc((void**)&dev_c, 12 * sizeof(double));
cudaMalloc((void**)&dev_a, size * sizeof(double));
cudaMalloc((void**)&dev_b, 12 * sizeof(double));
cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
double alpha = 1.0;
double beta = 0;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 4, 3, 3, &alpha, dev_a, 3, dev_b, 3, &beta, dev_c, 3);
cudaMemcpy(c, dev_c, 12 * sizeof(double), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
}
The two matrix used are:
1 2 3
4 5 6
7 8 9
10 11 12
10 20 30
40 50 60
70 80 90
while the output is:
** On entry to DGEMM parameter number 8 had an illegal value
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
There are 3 problems with your code.
You are not checking for CUDA errors or cuBLAS errors. CUDA error checking is described here What is the canonical way to check for errors using the CUDA runtime API?
With proper error checking you will find that cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
fails because of illegal memory accesses. dev_a
and dev_b
have been allocated with the wrong size. It should be 12 for dev_a and size for dev_b .
You make wrong assumption about the memory layout of the matrix. cuBLAS uses a column-major storage format. https://docs.nvidia.com/cuda/cublas/index.html#data-layout
This means that the leading dimensions of A and C are 4, not 3. This also means that A and B are
1 5 9
2 6 10
3 7 11
4 8 12
and
10 40 70
20 50 80
30 60 90
,respectively
Printing of C must also be changed to account for column-major format
Working code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
void mulWithCuda(double *c, const double *a, const double *b, unsigned int size);
int main(){
const int arraySize = 9;
const double a[12] = { 1, 2, 3, 4, 5, 6, 7, 8 ,9, 10, 11, 12 };
const double b[arraySize] = { 10, 20, 30, 40, 50, 60, 70, 80, 90 };
double c[12] = { 0 };
mulWithCuda(c, a, b, arraySize);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 3; j++) {
printf("%lf ", c[j * 4 + i]);
}
printf("\n");
}
return 0;
}
void mulWithCuda(double* c, const double* a, const double* b, unsigned int size){
double *dev_a = 0;
double *dev_b = 0;
double *dev_c = 0;
cudaMalloc((void**)&dev_c, 12 * sizeof(double));
cudaMalloc((void**)&dev_a, 12 * sizeof(double));
cudaMalloc((void**)&dev_b, size * sizeof(double));
cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
double alpha = 1.0;
double beta = 0;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 4, 3, 3, &alpha, dev_a, 4, dev_b, 3, &beta, dev_c, 4);
cudaMemcpy(c, dev_c, 12 * sizeof(double), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
}
Output
380.000000 830.000000 1280.000000
440.000000 980.000000 1520.000000
500.000000 1130.000000 1760.000000
560.000000 1280.000000 2000.000000