I am working on a project that performs extensive matrix operations and takes advantage of GPU offloading via OpenMP. Things work well for most of the code, except when I try to call a target-declared function from within a target region.
I created an example that computes the Gaussian integral between -10.0 and 10.0, squares the result and prints it to output. The correct answer is approximately PI, since the square of the integral of exp(-x*x) over the real line is exactly PI. The idea is to calculate it first with inline code, then by calling a target-declared function. My code is:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <omp.h>

#pragma omp begin declare target
double gauss(double* d_array, size_t size) {
    double d_result = 0.0;
    const double dx = 20.0 / (1.0 * size);
    #pragma omp teams distribute parallel for simd reduction(+:d_result)
    for (size_t i = 0; i < size; i++) {
        double x = 10.0 * i / (0.5 * size) - 10.0;
        d_array[i] = exp(-1.0 * x * x) * dx;
        d_result += d_array[i];
    }
    return d_result;
}
#pragma omp end declare target

int main(int argc, char *argv[]) {
    double t_start = 0.0, t_end = 0.0;
    double result = 0.0;
    size_t gb = 1024 * 1024 * 1024;
    size_t size = 2;
    size_t elements = gb / sizeof(double) * size;
    double *array = (double*)malloc(elements * sizeof(double));

    // Testing inline: this works fine
    printf("Running offloaded calculation from main()...\n");
    t_start = omp_get_wtime();
    #pragma omp target map(tofrom:result) map(to:elements) map(alloc:array[0:elements])
    {
        const double dx = 20.0 / (1.0 * elements);
        #pragma omp teams distribute parallel for simd reduction(+:result)
        for (int64_t i = 0; i < elements; i++) {
            double x = 10.0 * i / (0.5 * elements) - 10.0;
            array[i] = exp(-1.0 * x * x) * dx;
            result += array[i];
        }
    }
    t_end = omp_get_wtime();
    result *= result;
    printf("The result of the inline test is %lf.\n", result);
    printf("Inline calculation lasted %lfs.\n", (t_end - t_start));

    // Testing target-declared function: something's odd here
    printf("Running offloaded calculation from gauss()...\n");
    t_start = omp_get_wtime();
    #pragma omp target map(tofrom:result) map(to:elements) map(alloc:array[0:elements])
    {
        result = gauss(array, elements);
    }
    t_end = omp_get_wtime();
    result *= result;
    printf("The result of the test using gauss() is %lf.\n", result);
    printf("The calculation using gauss() lasted %lfs.\n", (t_end - t_start));

    free(array);
    return 0;
}
If I compile and run it with the NVIDIA compiler, I get:
$ nvcc -O3 gauss_test.c -o nv_gauss_test -lgomp
$ OMP_TARGET_OFFLOAD=MANDATORY ./nv_gauss_test
Running offloaded calculation from main()...
The result of the inline test is 3.141593.
Inline calculation lasted 1.867287s.
Running offloaded calculation from gauss()...
The result of the test using gauss() is 3.141593.
The calculation using gauss() lasted 1.427201s.
Conversely, if I compile and run it with GCC (EDIT: including the -foffload flag suggested by Jérôme below), I get:
$ gcc -O3 -fopenmp -no-pie -foffload=default -fcf-protection=none -fno-stack-protector -foffload-options="-O3 -fcf-protection=none -fno-stack-protector -lm -latomic -lgomp" gauss_test.c -o gnu_gauss_test -lm -lgomp
$ OMP_TARGET_OFFLOAD=MANDATORY ./gnu_gauss_test
Running offloaded calculation from main()...
The result of the inline test is 3.141593.
Inline calculation lasted 2.125599s.
Running offloaded calculation from gauss()...
The result of the test using gauss() is 0.000000.
The calculation using gauss() lasted 0.002109s.
i.e., the inline part works fine, but the gauss() function call (which worked with the NVIDIA compiler and works if I disable the offload) does not produce the desired result.
My guess is that I messed up something with the GNU compiler flags (my hardware does not support full stack protection on the device, so I disabled it). I also know that OpenMP has a default data mapping policy that works with the NVIDIA compiler but does not work as straightforwardly with the GNU compiler; this is the reason why I explicitly mapped some of the transfers, as in the sketch below.
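For example, this is the kind of difference I mean (a minimal, hypothetical sketch; mapping_sketch() is not part of my project):
#include <stddef.h>
#include <stdio.h>

void mapping_sketch(double *array, size_t elements) {
    double result = 0.0;

    /* Implicit mapping: a scalar referenced in a target region is
       firstprivate by default, so this device-side write is lost and
       result remains 0.0 on the host; a pointer like array is mapped
       as a zero-length array section, so its contents are not copied. */
    #pragma omp target
    result = 1.0;
    printf("after implicit mapping: %lf\n", result);  /* prints 0.000000 */

    /* Explicit mapping: direction and extent are spelled out, so the
       device-side write is copied back to the host. */
    #pragma omp target map(tofrom: result) map(alloc: array[0:elements])
    result = 2.0;
    printf("after explicit mapping: %lf\n", result);  /* prints 2.000000 */
}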
My project should in principle target general hardware, so using just nvcc is not the ideal solution. Does anyone have ideas?
Thank you very much!
PS: I am using nvcc V12.0.140 and gcc 14.2.0.
A few fixes were applied to the code.
The code is not a valid OpenMP program, as it violates this restriction:
If a teams region is nested inside a target region, the corresponding target construct must not contain any statements, declarations or directives outside of the corresponding teams construct.
Therefore, the calculation of dx was moved out of the target region.
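For illustration, here is a minimal sketch of the change (restriction_sketch() is just a stand-in for gauss(); the loop body is a placeholder):
#include <stddef.h>

double restriction_sketch(size_t size) {
    double d_result = 0.0;

    /* Non-conforming (the original gauss() body): computing dx inside
       the target region but outside the teams construct violates the
       restriction quoted above:

       #pragma omp target map(tofrom: d_result)
       {
           const double dx = 20.0 / (1.0 * size);  // statement outside teams
           #pragma omp teams distribute parallel for simd reduction(+:d_result)
           ...
       }
    */

    /* Conforming: dx is computed on the host and passed in as
       firstprivate, so teams is the sole content of the target region. */
    const double dx = 20.0 / (1.0 * size);
    #pragma omp target map(tofrom: d_result) firstprivate(dx, size)
    {
        #pragma omp teams distribute parallel for simd reduction(+:d_result)
        for (size_t i = 0; i < size; i++)
            d_result += dx;
    }
    return d_result;
}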
Furthermore, a first target region checks that the code is actually executed on a GPU and absorbs all possible initialization cost, to get a more comparable time measurement for the two kernels.
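Concretely, the warm-up region is just this (it also appears in the full listing below):
/* Prints 0 when the region really runs on the GPU. */
#pragma omp target
printf("Initial device: %i\n", omp_is_initial_device());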
Using a GPU that can actually do FP64 in hardware, I get similar performance with GCC and Clang:
Running offloaded calculation from main()...
The result of the inline test is 3.141593.
Inline calculation lasted 0.007591s.
Running offloaded calculation from gauss()...
The result of the test using gauss() is 3.141593.
The calculation using gauss() lasted 0.008335s.
$ clang -fopenmp -fopenmp-targets=nvptx64 --cuda-gpu-arch=sm_90 -O3 gauss_test.c -o gnu_gauss_test_clang
$ gcc -O3 -fopenmp -no-pie -foffload=nvptx-none -fcf-protection=none -fno-stack-protector -foffload-options="-O3 -fcf-protection=none -fno-stack-protector -lm -latomic -lgomp" -lm gauss_test.c -o gnu_gauss_test_gcc
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <omp.h>

// This is a device-specific function: it is called
// on the device and its return value is used by the
// device.
#pragma omp begin declare target
double increment(double x_val, double diff_val) {
    return (exp(-1.0 * x_val * x_val) * diff_val);
}
#pragma omp end declare target

// This is a wrapper function: it is called by the host
// and it opens a target region where the device function
// is executed. The return value of the device function is
// used to build the return value of the wrapper.
double gauss(double* d_array, size_t size) {
    double d_result = 0.0;
    const double dx = 20.0 / (1.0 * size);
    #pragma omp target map(tofrom: d_result) firstprivate(dx, size) map(alloc:d_array[0:size])
    {
        #pragma omp teams distribute parallel for simd reduction(+:d_result)
        for (size_t i = 0; i < size; i++) {
            double x = 10.0 * i / (0.5 * size) - 10.0;
            // Here I call the target-declared function increment()
            // to get the integration element that I want to sum.
            d_array[i] = increment(x, dx);
            d_result += d_array[i];
        }
    }
    return d_result;
}

int main(int argc, char *argv[]) {
    double t_start = 0.0, t_end = 0.0;
    double result = 0.0;
    size_t gb = 1024 * 1024 * 1024;
    size_t size = 2;
    size_t elements = gb / sizeof(double) * size;
    double *array = (double*)malloc(elements * sizeof(double));

    // Warm-up region: initializes the device and verifies offloading.
    #pragma omp target
    printf("Initial device: %i\n", omp_is_initial_device());

    // Testing inline: this works fine
    printf("Running offloaded calculation from main()...\n");
    t_start = omp_get_wtime();
    const double dx = 20.0 / (1.0 * elements);
    #pragma omp target map(tofrom:result) map(to:elements) map(alloc:array[0:elements]) firstprivate(dx)
    {
        #pragma omp teams distribute parallel for simd reduction(+:result)
        for (int64_t i = 0; i < elements; i++) {
            double x = 10.0 * i / (0.5 * elements) - 10.0;
            array[i] = exp(-1.0 * x * x) * dx;
            result += array[i];
        }
    }
    t_end = omp_get_wtime();
    result *= result;
    printf("The result of the inline test is %lf.\n", result);
    printf("Inline calculation lasted %lfs.\n", (t_end - t_start));

    // Testing wrapper function with target-declared calls: also works
    printf("Running offloaded calculation from gauss()...\n");
    t_start = omp_get_wtime();
    result = gauss(array, elements);
    t_end = omp_get_wtime();
    result *= result;
    printf("The result of the test using gauss() is %lf.\n", result);
    printf("The calculation using gauss() lasted %lfs.\n", (t_end - t_start));

    free(array);
    return 0;
}