With the following test example, the output matrix doesn't give the desired output or maybe I'm misunderstanding certain parameters:
#include <cstdio>
#include <cublas_v2.h>
#include <cuda_runtime.h>
void PrintMatrix(const char *name, float *mat, int M, int N) {
printf("Matrix: %s\n", name);
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
printf("%.2f, ", mat[i * N + j]);
}
printf("\n");
}
}
int main() {
cublasHandle_t handle;
cublasCreate(&handle);
// Matrix dimensions
int m = 3; // Rows of B
int n = 3; // Columns of B
const float alpha = 1.0f;
// Host matrices (column-major)
float h_A[] = {
1.0f, 0.0f, 0.0f,
2.0f, 3.0f, 0.0f,
4.0f, 5.0f, 6.0f
};
float h_B[] = {
1.0f, 4.0f, 7.0f,
2.0f, 5.0f, 8.0f,
3.0f, 6.0f, 9.0f
};
float h_C[9] = { 0 };
float *d_A, *d_B, *d_C;
cudaMalloc((void**)&d_A, m * m * sizeof(float));
cudaMalloc((void**)&d_B, m * n * sizeof(float));
cudaMalloc((void**)&d_C, m * n * sizeof(float));
cudaMemcpy(d_A, h_A, m * m * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, m * n * sizeof(float), cudaMemcpyHostToDevice);
// Perform B := alpha * A * B, A is lower triangular
cublasStrmm_64(
handle,
CUBLAS_SIDE_LEFT, // A on the left
CUBLAS_FILL_MODE_LOWER, // A is lower triangular
CUBLAS_OP_N, // No transpose
CUBLAS_DIAG_NON_UNIT, // A is non-unit triangular
m, // rows of B
n, // cols of B
&alpha,
d_A, m, // A, lda
d_B, m, // B, ldb
d_C, m // Output is written to C
);
// Copy result back to host
cudaMemcpy(h_C, d_C, m * n * sizeof(float), cudaMemcpyDeviceToHost);
PrintMatrix("A", h_A, m, n);
PrintMatrix("B", h_B, m, n);
PrintMatrix("C", h_C, m, n);
// Cleanup
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cublasDestroy(handle);
}
The output of this program is:
Matrix: A
1.00, 0.00, 0.00,
2.00, 3.00, 0.00,
4.00, 5.00, 6.00,
Matrix: B
1.00, 4.00, 7.00,
2.00, 5.00, 8.00,
3.00, 6.00, 9.00,
Matrix: C
1.00, 12.00, 42.00,
2.00, 15.00, 48.00,
3.00, 18.00, 54.00,
How does A*B result in C?
The main reason the output matrix doesn't match your expectation is due to the way CUBLAS interprets C and C++ arrays.
Quoting from the Data Layout Section in the CUBLAS docs -
For maximum compatibility with existing Fortran environments, the cuBLAS library uses column-major storage, and 1-based indexing. Since C and C++ use row-major storage, applications written in these languages can not use the native array semantics for two-dimensional arrays
(emphasis mine)
As a result your matrix A defined as -
float h_A[] = {
1.0f, 0.0f, 0.0f,
2.0f, 3.0f, 0.0f,
4.0f, 5.0f, 6.0f
};
is interpreted in column major format as -
1.0 2.0 4.0
0.0 3.0 5.0
0.0 0.0 6.0
Now, the CUBLAS_FILL_MODE_LOWER
argument to cublasStrmm_64
asks CUBLAS to interpret the matrix as lower triangular meaning the upper triangle is ignored. As a result the matrix multiplication is performed with the matrix -
1.0 0.0 0.0
0.0 3.0 0.0
0.0 0.0 6.0
Multiplication with this diagonal matrix explains the results you see in C.
To fix this, you can either change the parameter to CUBLAS_FILL_MODE_UPPER
or change the definition of h_A
to
float h_A[] = {
1.0f, 2.0f, 4.0f,
0.0f, 3.0f, 5.0f,
0.0f, 0.0f, 6.0f
};
Whichever makes sense for your application.