I am trying to optimize CUDA code with LLVM passes on a PowerPC system (RHEL 7.6, no root access) equipped with V100 GPUs, CUDA 10.1, and LLVM 11 (built from source). I have already tested clang, lli, and opt on a simple C++ program, and everything works fine.
After days of searching, reading, and trial and error, I managed to compile a simple CUDA source. The code is the well-known axpy example:
#include <cstdio>   // printf
#include <cstdlib>  // exit, EXIT_FAILURE
#include <iostream>

#define cudaCheckError() \
  { \
    cudaError_t e = cudaGetLastError(); \
    if (e != cudaSuccess) { \
      printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \
             cudaGetErrorString(e)); \
      exit(EXIT_FAILURE); \
    } \
  }
__global__ void axpy(float a, float* x, float* y) {
  y[threadIdx.x] = a * x[threadIdx.x];
}

int main(int argc, char* argv[]) {
  const int kDataLen = 4;

  float a = 2.0f;
  float host_x[kDataLen] = {1.0f, 2.0f, 3.0f, 4.0f};
  float host_y[kDataLen];

  // Copy input data to device.
  float* device_x;
  float* device_y;
  cudaMalloc(&device_x, kDataLen * sizeof(float));
  cudaMalloc(&device_y, kDataLen * sizeof(float));
  cudaMemcpy(device_x, host_x, kDataLen * sizeof(float),
             cudaMemcpyHostToDevice);

  // Launch the kernel.
  axpy<<<1, kDataLen>>>(a, device_x, device_y);
  cudaCheckError();

  // Copy output data to host.
  cudaDeviceSynchronize();
  cudaMemcpy(host_y, device_y, kDataLen * sizeof(float),
             cudaMemcpyDeviceToHost);

  // Print the results.
  for (int i = 0; i < kDataLen; ++i) {
    std::cout << "y[" << i << "] = " << host_y[i] << "\n";
  }

  cudaDeviceReset();
  return 0;
}
I wrote the compilation steps in a Makefile like this (I know its style can be improved); I haven't added any LLVM passes yet:
BIN_FILE=axpy
SRC_FILE=$(BIN_FILE).cu

main: $(BIN_FILE)

$(BIN_FILE).ll: $(SRC_FILE)
	clang++ -stdlib=libc++ -Wall $(BIN_FILE).cu --cuda-gpu-arch=sm_70 -S -c -emit-llvm

$(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll: $(SRC_FILE)
	clang++ -stdlib=libc++ -Wall $(BIN_FILE).cu --cuda-gpu-arch=sm_70 -S -c -emit-llvm

$(BIN_FILE).ptx: $(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll
	llc -march=nvptx64 $(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll -o $(BIN_FILE).ptx

$(BIN_FILE)_dlink.o: $(BIN_FILE).ptx
	ptxas -m64 --gpu-name=sm_70 $(BIN_FILE).ptx -o $(BIN_FILE).ptx.o
	fatbinary --64 --create $(BIN_FILE).fatbin --image=profile=sm_70,file=$(BIN_FILE).ptx.o
	nvcc $(BIN_FILE).fatbin -arch=sm_70 -dlink -o $(BIN_FILE)_dlink.o -rdc=true

# For the host code:
$(BIN_FILE).o: $(BIN_FILE).ll
	llc -mcpu=ppc64 $(BIN_FILE).ll -o $(BIN_FILE).s
	clang++ -c $(BIN_FILE).s -o $(BIN_FILE).o

# Link both object files together with a linker:
$(BIN_FILE): $(BIN_FILE).o $(BIN_FILE)_dlink.o
	nvcc $(BIN_FILE).o $(BIN_FILE)_dlink.o -o $(BIN_FILE) -arch=sm_70 -lc++

clean:
	rm *.ll *.s *.ptx *.ptx.o *.fatbin $(BIN_FILE) $(BIN_FILE).o $(BIN_FILE)_dlink.o
All the steps seem to run without any warnings, but when I run the executable I get this error:
Cuda failure axpy.cu:33: 'invalid device function'
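As far as I understand, 'invalid device function' means the runtime cannot find a kernel image that matches the device's compute capability. A small standalone probe along these lines (just a sketch, not part of the build above) can print the compute capability of device 0 to confirm that sm_70 really is the right target for the V100s:

// Standalone sanity check (not part of the build above): print the compute
// capability of device 0 so that sm_70 can be confirmed as the right target.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  cudaError_t e = cudaGetDeviceProperties(&prop, 0);
  if (e != cudaSuccess) {
    std::printf("cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(e));
    return 1;
  }
  std::printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
  return 0;
}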
I have also replaced the final link command with the following; it runs OK, but I get the same error:
clang++ -stdlib=libc++ $(BIN_FILE).o $(BIN_FILE)_dlink.o -o $(BIN_FILE) -lcuda -lcudart -lcudadevrt \
-L/path-to-gcc-lib/
I would really appreciate any help. It's worth mentioning that I need some of the modern CUDA features, most importantly cooperative groups, which is why I thought a recent LLVM release might work. I'm also wondering whether changing the LLVM version might help.
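For context, the kind of cooperative-groups code I eventually want to compile looks roughly like this (an illustrative sketch of the feature only, not part of the build above):

// Illustrative sketch: the same axpy kernel expressed with cooperative groups.
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void axpy_cg(float a, const float* x, float* y) {
  cg::thread_block block = cg::this_thread_block();
  unsigned int i = block.thread_rank();  // same as threadIdx.x for a 1-D block
  y[i] = a * x[i];
  block.sync();                          // cooperative-groups barrier, like __syncthreads()
}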
-- Edit:
The output of cuobjdump axpy.fatbin -ptx -sass:
Fatbin elf code:
================
arch = sm_70
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_70
Function : _Z4axpyfPfS_
.headerflags @"EF_CUDA_SM70 EF_CUDA_PTX_SM(EF_CUDA_SM70)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fd00000000f00 */
/*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ; /* 0x000000fffffff389 */
/* 0x000fe200000e00ff */
/*0020*/ IADD3 R1, R1, -0x18, RZ ; /* 0xffffffe801017810 */
/* 0x000fe20007ffe0ff */
/*0030*/ IMAD.MOV.U32 R9, RZ, RZ, c[0x0][0x16c] ; /* 0x00005b00ff097624 */
/* 0x000fe200078e00ff */
/*0040*/ MOV R8, c[0x0][0x168] ; /* 0x00005a0000087a02 */
/* 0x000fe40000000f00 */
/*0050*/ IADD3 R2, P0, R1, c[0x0][0x20], RZ ; /* 0x0000080001027a10 */
/* 0x000fc80007f1e0ff */
/*0060*/ IADD3.X R3, RZ, c[0x0][0x24], RZ, P0, !PT ; /* 0x00000900ff037a10 */
/* 0x000fd000007fe4ff */
/*0070*/ ST.E.64.SYS [R2+0x8], R8 ; /* 0x0000000802007385 */
/* 0x0001e8000010eb08 */
/*0080*/ LD.E.64.SYS R4, [R2+0x8] ; /* 0x0000000802047980 */
/* 0x000ea2000010eb00 */
/*0090*/ IMAD.MOV.U32 R10, RZ, RZ, c[0x0][0x170] ; /* 0x00005c00ff0a7624 */
/* 0x000fe200078e00ff */
/*00a0*/ MOV R11, c[0x0][0x174] ; /* 0x00005d00000b7a02 */
/* 0x000fe20000000f00 */
/*00b0*/ IMAD.MOV.U32 R15, RZ, RZ, c[0x0][0x160] ; /* 0x00005800ff0f7624 */
/* 0x000fe200078e00ff */
/*00c0*/ S2R R13, SR_TID.X ; /* 0x00000000000d7919 */
/* 0x000eac0000002100 */
/*00d0*/ ST.E.64.SYS [R2+0x10], R10 ; /* 0x0000001002007385 */
/* 0x000fe8000010eb0a */
/*00e0*/ LD.E.64.SYS R6, [R2+0x10] ; /* 0x0000001002067980 */
/* 0x000ee8000010eb00 */
/*00f0*/ ST.E.SYS [R2], R15 ; /* 0x0000000002007385 */
/* 0x000fe8000010e90f */
/*0100*/ LD.E.SYS R0, [R2] ; /* 0x0000000002007980 */
/* 0x000e22000010e900 */
/*0110*/ IMAD.WIDE.U32 R4, R13, 0x4, R4 ; /* 0x000000040d047825 */
/* 0x004fd400078e0004 */
/*0120*/ LD.E.SYS R5, [R4] ; /* 0x0000000004057980 */
/* 0x000e22000010e900 */
/*0130*/ IMAD.WIDE.U32 R6, R13, 0x4, R6 ; /* 0x000000040d067825 */
/* 0x008fe400078e0006 */
/*0140*/ FMUL R9, R0, R5 ; /* 0x0000000500097220 */
/* 0x001fd00000400000 */
/*0150*/ ST.E.SYS [R6], R9 ; /* 0x0000000006007385 */
/* 0x000fe2000010e909 */
/*0160*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0170*/ BRA 0x170; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
.......................
The output ends with dots.
It turns out the problem was not related to the PowerPC architecture. I needed to pass the fatbin file to the host-side compilation command with -Xclang -fcuda-include-gpubinary -Xclang axpy.fatbin, so that clang embeds the GPU binary in the host object and emits the kernel-registration code that a normal nvcc compilation would generate. Without that, the axpy kernel is never registered with the CUDA runtime, which is what the 'invalid device function' error was complaining about.
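A quick way to check whether a kernel actually got registered with the runtime is to ask for its attributes; when no matching device image was embedded, cudaFuncGetAttributes reports the same 'invalid device function' error. A small standalone sketch (not part of the Makefile below):

// Standalone registration check (not part of the Makefile below), built as a .cu file.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void probe(float a, float* x, float* y) {
  y[threadIdx.x] = a * x[threadIdx.x];
}

int main() {
  cudaFuncAttributes attr;
  // Fails with cudaErrorInvalidDeviceFunction if the fatbin was never
  // embedded in the host object and registered at program start-up.
  cudaError_t e = cudaFuncGetAttributes(&attr, probe);
  if (e != cudaSuccess) {
    std::printf("kernel not registered: %s\n", cudaGetErrorString(e));
    return 1;
  }
  std::printf("kernel registered: binaryVersion=%d, ptxVersion=%d\n",
              attr.binaryVersion, attr.ptxVersion);
  return 0;
}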
Here is the corrected Makefile:
BIN_FILE=axpy
SRC_FILE=$(BIN_FILE).cu

main: $(BIN_FILE)

# Host Side
$(BIN_FILE).ll: $(SRC_FILE) $(BIN_FILE).fatbin
	clang++ -stdlib=libc++ -Wall -Werror $(BIN_FILE).cu -march=ppc64le --cuda-host-only -relocatable-pch \
		-Xclang -fcuda-include-gpubinary -Xclang $(BIN_FILE).fatbin -S -g -c -emit-llvm

$(BIN_FILE).o: $(BIN_FILE).ll
	llc -march=ppc64le $(BIN_FILE).ll -o $(BIN_FILE).s
	clang++ -c -Wall $(BIN_FILE).s -o $(BIN_FILE).o

# GPU Side
$(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll: $(SRC_FILE)
	clang++ -x cuda -stdlib=libc++ -Wall -Werror $(BIN_FILE).cu --cuda-device-only \
		--cuda-gpu-arch=sm_70 -S -g -emit-llvm

$(BIN_FILE).ptx: $(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll
	llc -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 $(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll -o $(BIN_FILE).ptx

$(BIN_FILE).ptx.o: $(BIN_FILE).ptx
	ptxas -m64 --gpu-name=sm_70 $(BIN_FILE).ptx -o $(BIN_FILE).ptx.o

$(BIN_FILE).fatbin: $(BIN_FILE).ptx.o
	fatbinary --64 --create $(BIN_FILE).fatbin --image=profile=sm_70,file=$(BIN_FILE).ptx.o \
		--image=profile=compute_70,file=$(BIN_FILE).ptx -link

$(BIN_FILE)_dlink.o: $(BIN_FILE).fatbin
	nvcc $(BIN_FILE).fatbin -gencode arch=compute_70,code=sm_70 \
		-dlink -o $(BIN_FILE)_dlink.o -lcudart -lcudart_static -lcudadevrt

# Link both object files together (either nvcc or clang works here):
$(BIN_FILE): $(BIN_FILE).o $(BIN_FILE)_dlink.o
	nvcc $(BIN_FILE).o $(BIN_FILE)_dlink.o -o $(BIN_FILE) -arch=sm_70 -lc++
Figure 1 in this link shows the steps used to create the fatbinary file.