I've been working with a heat transfer code. This code, basically, stablishes the initial conditions for a cube and all of its faces. The six faces start at different temperatures, and then the code will be calculating how the temperature changes in all of the faces due to the heat transfer between them. Now, I've been trying offloading to an NVIDIA GPU using OpenMP directives. This code initializes the faces conditions using a triple pointer, which is sort of an array of arrays. Reading a little bit about this matter, I've come to know that 3D architectures are not easily offloaded to the GPUs. So my question is if it is possible to offload this triple pointer arrays to the GPU or if I have to use a more flat array form.
Here I leave the code, which is still working on CPU. Parallel version of the code.
#include <omp.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define N 25 //This defines the number of points per dimension (Cube = N*N*N)
#define NUM_STEPS 6000 //This is the number of simulations time steps
/*writeFile: this function writes simulation results into a file.
* A file is created for each iteration that's passed to the function
* as a parameter. It also takes the triple pointer to the simulation
* data*/
void writeFile(int iteration, double*** data){
char filename[50];
char itr[12];
sprintf(itr, "%d", iteration);
strcpy(filename, "heat_");
strcat(filename, itr);
strcat(filename, ".txt");
//printf("Filename is %s\n", filename);
FILE *fp;
fp = fopen(filename, "w");
fprintf(fp, "x,y,z,T\n");
for(int i=0; i<N; i++){
for(int j=0;j<N; j++){
for(int k=0; k<N; k++){
fprintf(fp,"%d,%d,%d,%f\n", i,j,k,data[i][j][k]);
}
}
}
fclose(fp);
}
void compute_heat_transfer(double ***arrayOld, double ***arrayNew){
int i,j,k;
/*Compute steady-state solution*/
for(int nsteps=0; nsteps < NUM_STEPS; nsteps++){
/*if(nsteps % 100 == 0){
writeFile(nsteps, arrayOld);
}*/
#pragma omp parallel shared(arrayNew, arrayOld) private(i,j,k)
{
#pragma omp for
for(i=1; i<N-1; i++){
for(j=1; j<N-1; j++){
for(k=1;k<N-1;k++){
//This is the 6-neighbor stencil computation
arrayNew[i][j][k] = (arrayOld[i-1][j][k] + arrayOld[i+1][j][k] + arrayOld[i][j-1][k] + arrayOld[i][j+1][k] +
arrayOld[i][j][k-1] + arrayOld[i][j][k+1])/6.0;
}
}
}
#pragma omp for
for(i=1; i<N-1; i++){
for(j=1; j<N-1; j++){
for(k=1; k<N-1; k++){
arrayOld[i][j][k] = arrayNew[i][j][k];
}
}
}
}
}
}
int main (int argc, char *argv[]) {
int i,j,k,nsteps;
double mean;
double ***arrayOld; //Variable that will hold the data of the past iteration
double ***arrayNew; //Variable where newly computed data will be stored
arrayOld = (double***)malloc(N*sizeof(double**));
arrayNew = (double***)malloc(N*sizeof(double**));
if(arrayOld== NULL){
fprintf(stderr, "Out of memory");
exit(0);
}
for(i=0; i<N;i++){
arrayOld[i] = (double**)malloc(N*sizeof(double*));
arrayNew[i] = (double**)malloc(N*sizeof(double*));
if(arrayOld[i]==NULL){
fprintf(stderr, "Out of memory");
exit(0);
}
for(int j=0;j<N;j++){
arrayOld[i][j] = (double*)malloc(N*sizeof(double));
arrayNew[i][j] = (double*)malloc(N*sizeof(double));
if(arrayOld[i][j]==NULL){
fprintf(stderr,"Out of memory");
exit(0);
}
}
}
/*Set boundary values and compute mean boundary values*/
mean = 0.0;
for(i=0; i<N; i++){
for(j=0;j<N;j++){
arrayOld[i][j][0] = 100.0;
mean += arrayOld[i][j][0];
}
}
for(i=0; i<N; i++){
for(j=0;j<N;j++){
arrayOld[i][j][N-1] = 100.0;
mean += arrayOld[i][j][N-1];
}
}
for(j=0; j<N; j++){
for(k=0;k<N;k++){
arrayOld[0][j][k] = 100.0;
mean += arrayOld[0][j][k];
}
}
for(j=0; j<N; j++){
for(k=0;k<N;k++){
arrayOld[N-1][j][k] = 100.0;
mean += arrayOld[N-1][j][k];
}
}
for(i=0; i<N; i++){
for(k=0;k<N;k++){
arrayOld[i][0][k] = 100.0;
mean += arrayOld[i][0][k];
}
}
for(i=0; i<N; i++){
for(k=0;k<N;k++){
arrayOld[i][N-1][k] = 0.0;
mean += arrayOld[i][N-1][k];
}
}
mean /= (6.0 * (N*N));
/*Initialize interior values*/
for(i=1; i<N-1; i++){
for(j=1; j<N-1; j++){
for(k=1; k<N-1;k++){
arrayOld[i][j][k] = mean;
}
}
}
double tdata = omp_get_wtime();
compute_heat_transfer(arrayOld, arrayNew);
tdata = omp_get_wtime()-tdata;
printf("Execution time was %f secs\n", tdata);
for(i=0; i<N;i++){
for(int j=0;j<N;j++){
free(arrayOld[i][j]);
free(arrayNew[i][j]);
}
free(arrayOld[i]);
free(arrayNew[i]);
}
free(arrayOld);
free(arrayNew);
return 0;
}
Use variable length arrays with dynamic storage:
double (*arr)[N][N] = calloc(N, sizeof *arr);
Indexing.
Use good old arr[i][j][k]
syntax
Deallocation.
free(arr)
double *flat = (double*)arr;
Note that this conversion is not guaranteed by the C standard to work. Though it will very likely work on all platforms capable of using GPUs.
void fun(int n, double arr[n][n][n]) {
...
}
Exemplary usage would be:
foo(N, arr);
EDIT
VLA friendly variant of compute_heat_transfer()
:
void compute_heat_transfer(int n, double arrayOld[restrict n][n][n], double arrayNew[restrict n][n][n]) {
int i,j,k;
/*Compute steady-state solution*/
for(int nsteps=0; nsteps < NUM_STEPS; nsteps++){
/*if(nsteps % 100 == 0){
writeFile(nsteps, arrayOld);
}*/
#pragma omp parallel for collapse(3)
for(i=1; i<n-1; i++){
for(j=1; j<n-1; j++){
for(k=1; k<n-1; k++){
//This is the 6-neighbor stencil computation
arrayNew[i][j][k] = (arrayOld[i-1][j][k] + arrayOld[i+1][j][k] + arrayOld[i][j-1][k] + arrayOld[i][j+1][k] +
arrayOld[i][j][k-1] + arrayOld[i][j][k+1])/6.0;
}}}
#pragma omp parallel for collapse(3)
for(i=1; i<n-1; i++){
for(j=1; j<n-1; j++){
for(k=1; k<n-1; k++){
arrayOld[i][j][k] = arrayNew[i][j][k];
}}}
}
}
Keyword restrict
in arrNew[restrict n][n][n]
is used to let the compiler assume that arrNew
and arrOld
do not alias. It should let the compiler use more aggressive optimizations.
Note that arrNew
and arrOld
are pointer to arrays. So rather than copy arrNew
to arrOld
you could simply swap those pointers forming a kind of simple double buffering. It should make the code even faster.