Recently I started to study the CUDA technology, which allows me to increase the speed of calculations by several times thanks to the paralleling technology. The program I wrote has to calculate the determinant of a square matrix using the Gaussian method. Unfortunately, for some reason it always tells me that the determinant of a matrix is equal to one, although it is not. In addition, the time is also shown incorrectly. Most likely, there is an error in calculate_determinant function, but I haven't been able to find it.
I work in Ubuntu 22.02 and this is how I compile my program:
nvcc test2.cu -o test2
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <malloc.h>
#include <stdint.h>
#include <time.h>
#include <clocale>
#include <CL/cl.h>
#include <cuda_runtime.h>
#include <cuda.h>
#define MAX_SIZE 10000
#define BLOCK_SIZE 512
#define THREAD_SIZE 1024
__global__ void calculate_determinant(double* device_matrix, int size, double* det) {
int j = threadIdx.x + blockDim.x * blockIdx.x;
double ratio;
int i, k;
double det_local;
// Initialize local determinant to 1 for all threads
if (threadIdx.x == 0) {
det_local = 1.0;
}
__syncthreads();
if (j < size) {
for (i = 0; i < size - 1; i++) {
// Synchronize threads to ensure previous row operations are complete
__syncthreads();
// Perform row operations on matrix elements below the diagonal
if (j > i) {
if (device_matrix[i * size + i]) {
ratio = device_matrix[j * size + i] / device_matrix[i * size + i];
} else {
ratio = 1;
}
for (k = i; k < size; k++) {
device_matrix[j * size + k] -= ratio * device_matrix[i * size + k];
}
} else {
break;
}
}
// Multiply local determinant by diagonal element in the last row processed by this thread
det_local *= device_matrix[(j)*size + j];
}
// Thread 0 writes the final determinant to global memory
if (threadIdx.x == 0) {
*det = det_local;
}
}
__host__ double* build_matrix(uint32_t size) {
uint32_t i, j;
double* matrix = (double*)malloc(size * size * sizeof(double));
if (matrix == NULL) {
printf("Memory allocation error in build_matrix");
exit(2);
}
printf("OG matrix:\n");
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
matrix[i * size + j] = rand() % 10;
printf("%.3f ", matrix[i * size + j]);
}
printf("\n");
}
return matrix;
}
__host__ int main() {
uint32_t size, i;
printf("Enter the size of the matrix: ");
scanf("%u", &size);
if (size > MAX_SIZE) {
printf("The matrix is too big");
return 1;
}
srand((unsigned)time(NULL));
double* matrix = build_matrix(size);
double* device_matrix;
double host_det = 1.0;
double* device_det;
// Allocate memory on device
cudaMalloc((void**)&device_matrix, size * size * sizeof(double));
cudaMalloc((void**)&device_det, sizeof(double));
// Copy matrix and determinant from host to device
cudaMemcpy(device_matrix, matrix, size * size * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(device_det, &host_det, sizeof(double), cudaMemcpyHostToDevice);
// Recording time
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//float start2 = clock();
calculate_determinant <<<BLOCK_SIZE+1, THREAD_SIZE >>> (device_matrix, size, device_det);
cudaThreadSynchronize();
// float end = clock();
// Recording time
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
// Copy determinant back from device to host
cudaMemcpy(&host_det, &(device_det[0]), sizeof(double), cudaMemcpyDeviceToHost);
// Free memory on device
cudaFree(device_matrix);
cudaFree(device_det);
free(matrix);
printf("Determinant is : %.3f\n", host_det);
printf("Time elapsed: %.2f\n", elapsedTime);
// printf("Time of execution = %.2f\n", end - start2);
return 0;
}
I've tried different ways, including changing the way I form a matrix, but there is still no result. Maybe there are some experienced CUDA users who can share their experience?
It wasn't evident to me what algorithm you were trying to use from your code. Therefore I did a google search, found this and implemented that.
Example:
$ cat t2234.cu
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#define THREAD_SIZE 1024 // must be power-of-2, between 2 and 1024
#define MAX_SIZE THREAD_SIZE
__global__ void calculate_determinant(double* device_matrix, int size, double* det) {
__shared__ double smem[THREAD_SIZE];
int col = threadIdx.x; // requires changes to work with more than 1 block
double ratio;
int row, tcol;
for (tcol = 0; tcol < size-1; tcol++)
for (row = tcol+1; row < size; row++) {
// Perform row operations on matrix elements to zero out elements below main diagonal
ratio = device_matrix[row * size + tcol] / device_matrix[tcol * size + tcol];
__syncthreads();
device_matrix[row * size + col] -= ratio * device_matrix[tcol * size + col];
// Synchronize threads to ensure previous row operations are complete
__syncthreads();
}
// compute determinant - parallel sweep product reduction
smem[threadIdx.x] = device_matrix[threadIdx.x*size+threadIdx.x];
for (int st = THREAD_SIZE/2; st > 0; st>>=1){
__syncthreads();
if ((col<st)&&((col+st)<blockDim.x)) smem[col] *= smem[col+st];}
if (!threadIdx.x) *det = smem[0];
}
__host__ double* build_matrix(uint32_t size) {
uint32_t i, j;
double* matrix = (double*)malloc(size * size * sizeof(double));
if (matrix == NULL) {
printf("Memory allocation error in build_matrix");
exit(2);
}
printf("OG matrix:\n");
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
matrix[i * size + j] = rand() % 10;
if ((i ==0) && (matrix[i*size+j] == 0)) matrix[i*size+j] = 1;
printf("%.3f ", matrix[i * size + j]);
}
printf("\n");
}
return matrix;
}
__host__ int main() {
uint32_t size;
printf("Enter the size of the matrix: ");
scanf("%u", &size);
if (size > MAX_SIZE) {
printf("The matrix is too big");
return 1;
}
srand((unsigned)time(NULL));
double* matrix = build_matrix(size);
double* device_matrix;
double host_det = 1.0;
double* device_det;
// Allocate memory on device
cudaMalloc((void**)&device_matrix, size * size * sizeof(double));
cudaMalloc((void**)&device_det, sizeof(double));
// Copy matrix and determinant from host to device
cudaMemcpy(device_matrix, matrix, size * size * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(device_det, &host_det, sizeof(double), cudaMemcpyHostToDevice);
// Recording time
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//float start2 = clock();
calculate_determinant <<<1, size >>> (device_matrix, size, device_det);
cudaDeviceSynchronize();
// float end = clock();
// Recording time
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
// Copy determinant back from device to host
cudaMemcpy(&host_det, &(device_det[0]), sizeof(double), cudaMemcpyDeviceToHost);
// Free memory on device
cudaFree(device_matrix);
cudaFree(device_det);
free(matrix);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("CUDA error: %s\n", cudaGetErrorString(err));
printf("Determinant is : %.3f\n", host_det);
printf("Time elapsed: %.2f\n", elapsedTime);
// printf("Time of execution = %.2f\n", end - start2);
return 0;
}
$ nvcc -o t2234 t2234.cu
$ ./t2234
Enter the size of the matrix: 3
OG matrix:
5.000 1.000 8.000
1.000 3.000 2.000
9.000 3.000 2.000
Determinant is : -176.000
Time elapsed: 0.05
$
compute-sanitizer
to see if you have any system/hardware/cuda-setup problems.