cgccgpuopenmpoffloading

Perform a triple pointer (C) offloading to NVIDIA GPU with OpenMP


I've been working with a heat transfer code. This code, basically, stablishes the initial conditions for a cube and all of its faces. The six faces start at different temperatures, and then the code will be calculating how the temperature changes in all of the faces due to the heat transfer between them. Now, I've been trying offloading to an NVIDIA GPU using OpenMP directives. This code initializes the faces conditions using a triple pointer, which is sort of an array of arrays. Reading a little bit about this matter, I've come to know that 3D architectures are not easily offloaded to the GPUs. So my question is if it is possible to offload this triple pointer arrays to the GPU or if I have to use a more flat array form.

Here I leave the code, which is still working on CPU. Parallel version of the code.

#include <omp.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define N 25 //This defines the number of points per dimension (Cube = N*N*N)
#define NUM_STEPS 6000 //This is the number of simulations time steps

/*writeFile: this function writes simulation results into a file. 
 * A file is created for each iteration that's passed to the function 
 * as a parameter. It also takes the triple pointer to the simulation 
 * data*/
void writeFile(int iteration, double*** data){
    char filename[50];
    char itr[12];
    sprintf(itr, "%d", iteration);

    strcpy(filename, "heat_");
    strcat(filename, itr);
    strcat(filename, ".txt");
    //printf("Filename is %s\n", filename);

    FILE *fp;
    fp = fopen(filename, "w");

    fprintf(fp, "x,y,z,T\n");
    for(int i=0; i<N; i++){
        for(int j=0;j<N; j++){
            for(int k=0; k<N; k++){
                fprintf(fp,"%d,%d,%d,%f\n", i,j,k,data[i][j][k]);
            }
        }
    }
    fclose(fp);
}
void compute_heat_transfer(double ***arrayOld, double ***arrayNew){
    int i,j,k;
    /*Compute steady-state solution*/
    for(int nsteps=0; nsteps < NUM_STEPS; nsteps++){
        /*if(nsteps % 100 == 0){
            writeFile(nsteps, arrayOld);
        }*/
        #pragma omp parallel shared(arrayNew, arrayOld) private(i,j,k)
        {
            #pragma omp for
            for(i=1; i<N-1; i++){
                for(j=1; j<N-1; j++){
                    for(k=1;k<N-1;k++){
                        //This is the 6-neighbor stencil computation
                        arrayNew[i][j][k] = (arrayOld[i-1][j][k] + arrayOld[i+1][j][k] + arrayOld[i][j-1][k] + arrayOld[i][j+1][k] +
                                 arrayOld[i][j][k-1] + arrayOld[i][j][k+1])/6.0;
                    }
                }
            }
            #pragma omp for
            for(i=1; i<N-1; i++){
                for(j=1; j<N-1; j++){
                    for(k=1; k<N-1; k++){
                        arrayOld[i][j][k] = arrayNew[i][j][k];
                    }   
                }
            }
        }
    }
}

int main (int argc, char *argv[]) {
    int i,j,k,nsteps; 
    double mean;
    double ***arrayOld; //Variable that will hold the data of the past iteration
    double ***arrayNew; //Variable where newly computed data will be stored
    arrayOld = (double***)malloc(N*sizeof(double**));
    arrayNew = (double***)malloc(N*sizeof(double**));
    if(arrayOld== NULL){
        fprintf(stderr, "Out of memory");
        exit(0);
    }
    for(i=0; i<N;i++){
        arrayOld[i] = (double**)malloc(N*sizeof(double*));
        arrayNew[i] = (double**)malloc(N*sizeof(double*));
        if(arrayOld[i]==NULL){
            fprintf(stderr, "Out of memory");
            exit(0);
        }
        for(int j=0;j<N;j++){
            arrayOld[i][j] = (double*)malloc(N*sizeof(double));
            arrayNew[i][j] = (double*)malloc(N*sizeof(double));
            if(arrayOld[i][j]==NULL){
                fprintf(stderr,"Out of memory");
                exit(0);
            }
        }
    }

    /*Set boundary values and compute mean boundary values*/
    mean = 0.0;

    for(i=0; i<N; i++){
        for(j=0;j<N;j++){
            arrayOld[i][j][0] = 100.0;
            mean += arrayOld[i][j][0];
        }
    }

    for(i=0; i<N; i++){
        for(j=0;j<N;j++){
            arrayOld[i][j][N-1] = 100.0;
            mean += arrayOld[i][j][N-1];
        }
    }

    for(j=0; j<N; j++){
        for(k=0;k<N;k++){
            arrayOld[0][j][k] = 100.0;
            mean += arrayOld[0][j][k];
        }
    }
    
    for(j=0; j<N; j++){
        for(k=0;k<N;k++){
            arrayOld[N-1][j][k] = 100.0;
            mean += arrayOld[N-1][j][k];
        }
    }

    for(i=0; i<N; i++){
        for(k=0;k<N;k++){
            arrayOld[i][0][k] = 100.0;
            mean += arrayOld[i][0][k];
        }
    }
    
    for(i=0; i<N; i++){
        for(k=0;k<N;k++){
            arrayOld[i][N-1][k] = 0.0;
            mean += arrayOld[i][N-1][k];
        }
    }

    mean /= (6.0 * (N*N));

    /*Initialize interior values*/
    for(i=1; i<N-1; i++){
        for(j=1; j<N-1; j++){
            for(k=1; k<N-1;k++){
                arrayOld[i][j][k] = mean;
            }
        }
    }

    double tdata = omp_get_wtime();

    compute_heat_transfer(arrayOld, arrayNew);

    tdata = omp_get_wtime()-tdata;

    printf("Execution time was %f secs\n", tdata);
    
    for(i=0; i<N;i++){
        for(int j=0;j<N;j++){
            free(arrayOld[i][j]);
            free(arrayNew[i][j]);
        }
        free(arrayOld[i]);
        free(arrayNew[i]);
    }
    free(arrayOld);
    free(arrayNew);
    
    return 0;
}

Solution

  • Use variable length arrays with dynamic storage:

    1. Allocation:
    double (*arr)[N][N] = calloc(N, sizeof *arr);
    
    1. Indexing. Use good old arr[i][j][k] syntax

    2. Deallocation.

    free(arr)
    
    1. Flattening.
    double *flat = (double*)arr;
    

    Note that this conversion is not guaranteed by the C standard to work. Though it will very likely work on all platforms capable of using GPUs.

    1. Passing to functions. VLAs can be parameters of the functions.
    void fun(int n, double arr[n][n][n]) {
      ...
    }
    

    Exemplary usage would be:

    foo(N, arr);
    

    EDIT

    VLA friendly variant of compute_heat_transfer():

    void compute_heat_transfer(int n, double arrayOld[restrict n][n][n], double arrayNew[restrict n][n][n]) {
        int i,j,k;
        /*Compute steady-state solution*/
        for(int nsteps=0; nsteps < NUM_STEPS; nsteps++){
            /*if(nsteps % 100 == 0){
                writeFile(nsteps, arrayOld);
            }*/
            #pragma omp parallel for collapse(3) 
            for(i=1; i<n-1; i++){
            for(j=1; j<n-1; j++){
            for(k=1; k<n-1; k++){
               //This is the 6-neighbor stencil computation
              arrayNew[i][j][k] = (arrayOld[i-1][j][k] + arrayOld[i+1][j][k] + arrayOld[i][j-1][k] + arrayOld[i][j+1][k] +
                                     arrayOld[i][j][k-1] + arrayOld[i][j][k+1])/6.0;
            }}}
            #pragma omp parallel for collapse(3)
            for(i=1; i<n-1; i++){
            for(j=1; j<n-1; j++){
            for(k=1; k<n-1; k++){
              arrayOld[i][j][k] = arrayNew[i][j][k];
            }}}
        }
    }
    

    Keyword restrict in arrNew[restrict n][n][n] is used to let the compiler assume that arrNew and arrOld do not alias. It should let the compiler use more aggressive optimizations.

    Note that arrNew and arrOld are pointer to arrays. So rather than copy arrNew to arrOld you could simply swap those pointers forming a kind of simple double buffering. It should make the code even faster.