ubuntuparallel-processingcudadeterminants

matrix determinant in CUDA


Recently I started to study the CUDA technology, which allows me to increase the speed of calculations by several times thanks to the paralleling technology. The program I wrote has to calculate the determinant of a square matrix using the Gaussian method. Unfortunately, for some reason it always tells me that the determinant of a matrix is equal to one, although it is not. In addition, the time is also shown incorrectly. Most likely, there is an error in calculate_determinant function, but I haven't been able to find it.

I work in Ubuntu 22.02 and this is how I compile my program: nvcc test2.cu -o test2

The output of the ./test2

#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <malloc.h>
#include <stdint.h>
#include <time.h>
#include <clocale>
#include <CL/cl.h>
#include <cuda_runtime.h>
#include <cuda.h>

#define MAX_SIZE 10000
#define BLOCK_SIZE 512
#define THREAD_SIZE 1024

__global__ void calculate_determinant(double* device_matrix, int size, double* det) {
     int j = threadIdx.x + blockDim.x * blockIdx.x;
    double ratio;
    int i, k;
    double det_local;

    // Initialize local determinant to 1 for all threads
    if (threadIdx.x == 0) {
        det_local = 1.0;
    }
    __syncthreads();

    if (j < size) {
        for (i = 0; i < size - 1; i++) {
            // Synchronize threads to ensure previous row operations are complete
            __syncthreads();

            // Perform row operations on matrix elements below the diagonal
            if (j > i) {
                if (device_matrix[i * size + i]) {
                    ratio = device_matrix[j * size + i] / device_matrix[i * size + i];
                } else {
                    ratio = 1;
                }
                for (k = i; k < size; k++) {
                    device_matrix[j * size + k] -= ratio * device_matrix[i * size + k];
                }
            } else {
                break;
            }
        }
        // Multiply local determinant by diagonal element in the last row processed by this thread
        det_local *= device_matrix[(j)*size + j];
    }

    // Thread 0 writes the final determinant to global memory
    if (threadIdx.x == 0) {
        *det = det_local;
    }
}


__host__ double* build_matrix(uint32_t size) {
    uint32_t i, j;
    double* matrix = (double*)malloc(size * size * sizeof(double));
    if (matrix == NULL) {
        printf("Memory allocation error in build_matrix");
        exit(2);
    }

    printf("OG matrix:\n");
    for (i = 0; i < size; i++) {
        for (j = 0; j < size; j++) {
            matrix[i * size + j] = rand() % 10;
            printf("%.3f ", matrix[i * size + j]);
        }
        printf("\n");
    }

    return matrix;
}

__host__ int main() {

    uint32_t size, i;
    printf("Enter the size of the matrix: ");
    scanf("%u", &size);
    if (size > MAX_SIZE) {
        printf("The matrix is too big");
        return 1;
    }

    srand((unsigned)time(NULL));

    double* matrix = build_matrix(size);
    double* device_matrix;
    double host_det = 1.0;
    double* device_det;

    // Allocate memory on device
    cudaMalloc((void**)&device_matrix, size * size * sizeof(double));
    cudaMalloc((void**)&device_det, sizeof(double));

    // Copy matrix and determinant from host to device
    cudaMemcpy(device_matrix, matrix, size * size * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(device_det, &host_det, sizeof(double), cudaMemcpyHostToDevice);

    //  Recording time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    //float start2 = clock();


   
    calculate_determinant <<<BLOCK_SIZE+1, THREAD_SIZE >>> (device_matrix, size, device_det);
    cudaThreadSynchronize();

   // float end = clock();


    // Recording time
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);


    // Copy determinant back from device to host
    cudaMemcpy(&host_det, &(device_det[0]), sizeof(double), cudaMemcpyDeviceToHost);

    // Free memory on device
    cudaFree(device_matrix);
    cudaFree(device_det);
    free(matrix);

    printf("Determinant is : %.3f\n", host_det);
    printf("Time elapsed:  %.2f\n", elapsedTime);
   // printf("Time of execution =  %.2f\n", end - start2);

    return 0;
}

I've tried different ways, including changing the way I form a matrix, but there is still no result. Maybe there are some experienced CUDA users who can share their experience?


Solution

  • It wasn't evident to me what algorithm you were trying to use from your code. Therefore I did a google search, found this and implemented that.

    Example:

    $ cat t2234.cu
    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>
    
    #define THREAD_SIZE 1024 // must be power-of-2, between 2 and 1024
    #define MAX_SIZE THREAD_SIZE
    
    __global__ void calculate_determinant(double* device_matrix, int size, double* det) {
        __shared__ double smem[THREAD_SIZE];
        int col = threadIdx.x; // requires changes to work with more than 1 block
        double ratio;
        int row, tcol;
    
        for (tcol = 0; tcol < size-1; tcol++)
          for (row = tcol+1; row < size; row++) {
            // Perform row operations on matrix elements to zero out elements below main diagonal
            ratio = device_matrix[row * size + tcol] / device_matrix[tcol * size + tcol];
            __syncthreads();
            device_matrix[row * size + col] -= ratio * device_matrix[tcol * size + col];
            // Synchronize threads to ensure previous row operations are complete
            __syncthreads();
            }
            // compute determinant - parallel sweep product reduction
        smem[threadIdx.x] = device_matrix[threadIdx.x*size+threadIdx.x];
        for (int st = THREAD_SIZE/2; st > 0; st>>=1){
          __syncthreads();
          if ((col<st)&&((col+st)<blockDim.x)) smem[col] *= smem[col+st];}
        if (!threadIdx.x) *det = smem[0];
    }
    
    
    __host__ double* build_matrix(uint32_t size) {
        uint32_t i, j;
        double* matrix = (double*)malloc(size * size * sizeof(double));
        if (matrix == NULL) {
            printf("Memory allocation error in build_matrix");
            exit(2);
        }
    
        printf("OG matrix:\n");
        for (i = 0; i < size; i++) {
            for (j = 0; j < size; j++) {
                matrix[i * size + j] = rand() % 10;
                if ((i ==0) && (matrix[i*size+j] == 0)) matrix[i*size+j] = 1;
                printf("%.3f ", matrix[i * size + j]);
            }
            printf("\n");
        }
    
        return matrix;
    }
    
    __host__ int main() {
    
        uint32_t size;
        printf("Enter the size of the matrix: ");
        scanf("%u", &size);
        if (size > MAX_SIZE) {
            printf("The matrix is too big");
            return 1;
        }
    
        srand((unsigned)time(NULL));
    
        double* matrix = build_matrix(size);
        double* device_matrix;
        double host_det = 1.0;
        double* device_det;
    
        // Allocate memory on device
        cudaMalloc((void**)&device_matrix, size * size * sizeof(double));
        cudaMalloc((void**)&device_det, sizeof(double));
    
        // Copy matrix and determinant from host to device
        cudaMemcpy(device_matrix, matrix, size * size * sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(device_det, &host_det, sizeof(double), cudaMemcpyHostToDevice);
    
        //  Recording time
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        //float start2 = clock();
    
    
    
        calculate_determinant <<<1, size >>> (device_matrix, size, device_det);
        cudaDeviceSynchronize();
    
       // float end = clock();
    
    
        // Recording time
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        float elapsedTime;
        cudaEventElapsedTime(&elapsedTime, start, stop);
    
    
        // Copy determinant back from device to host
        cudaMemcpy(&host_det, &(device_det[0]), sizeof(double), cudaMemcpyDeviceToHost);
    
        // Free memory on device
        cudaFree(device_matrix);
        cudaFree(device_det);
        free(matrix);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("CUDA error: %s\n", cudaGetErrorString(err));
        printf("Determinant is : %.3f\n", host_det);
        printf("Time elapsed:  %.2f\n", elapsedTime);
       // printf("Time of execution =  %.2f\n", end - start2);
    
        return 0;
    }
    $ nvcc -o t2234 t2234.cu
    $ ./t2234
    Enter the size of the matrix: 3
    OG matrix:
    5.000 1.000 8.000
    1.000 3.000 2.000
    9.000 3.000 2.000
    Determinant is : -176.000
    Time elapsed:  0.05
    $