Hi I am trying to compile my application with fatbin flag is active: set_property(TARGET offline PROPERTY CUDA_FATBIN_COMPILATION ON) The error code is:
error : Undefined reference to '_Z7computefff' in 'ltoPtx'
error: nvJitLinkComplete(handle) failed with error 6
error: ERROR 9: finish
When I compiled the code directly using the next command the code run correctly:
nvcc -arch=lto_86 -rdc=true -fatbin offline.cu
When I used Cmake to configure the build the build line is:
Building CUDA object CMakeFiles/offlineLib.dir/offline.fatbin
/usr/local/cuda/bin/nvcc -forward-unknown-to-host-compiler -std=c++17 "--generate-code=arch=compute_86,code=[compute_86,sm_86]" -MD -MT CMakeFiles/offlineLib.dir/offline.fatbin -MF CMakeFiles/offlineLib.dir/offline.fatbin.d -x cu -fatbin /home/Yehonatans/tmp/jitEx/offline.cu -o CMakeFiles/offlineLib.dir/offline.fatbin
My cmake file is:
cmake_minimum_required(VERSION 3.29)
project(TestJitLto CUDA)
set(CMAKE_CUDA_ARCHITECTURES 86)
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_CUDA_STANDARD 17)
find_package(CUDAToolkit REQUIRED cudadevrt cudart nvJitLink)
# Find the path to nvcc
message(STATUS "nvcc found at: ${CMAKE_CUDA_COMPILER}")
add_executable(TestJitLto online.cu)
set_target_properties(TestJitLto PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(TestJitLto PUBLIC CUDA::nvrtc CUDA::nvJitLink cuda CUDA::cudart)
add_library(offlineLib OBJECT offline.cu )
set_property(TARGET offlineLib PROPERTY CUDA_FATBIN_COMPILATION ON)
Online code
#include <nvrtc.h>
#include <cuda.h>
#include <nvJitLink.h>
#include <nvrtc.h>
#include <iostream>
#define NUM_THREADS 128
#define NUM_BLOCKS 32
#define NVRTC_SAFE_CALL(x) \
do { \
nvrtcResult result = x; \
if (result != NVRTC_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< nvrtcGetErrorString(result) << '\n'; \
exit(1); \
} \
} while(0)
#define CUDA_SAFE_CALL(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
exit(1); \
} \
} while(0)
#define NVJITLINK_SAFE_CALL(h,x) \
do { \
nvJitLinkResult result = x; \
if (result != NVJITLINK_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< result << '\n'; \
size_t lsize; \
result = nvJitLinkGetErrorLogSize(h, &lsize); \
if (result == NVJITLINK_SUCCESS && lsize > 0) { \
char *log = (char*)malloc(lsize); \
result = nvJitLinkGetErrorLog(h, log); \
if (result == NVJITLINK_SUCCESS) { \
std::cerr << "error: " << log << '\n'; \
free(log); \
} \
} \
exit(1); \
} \
} while(0)
const char *lto_saxpy = " \n\
extern __device__ float compute(float a, float x, float y); \n\
\n\
extern \"C\" __global__ \n\
void saxpy(float a, float *x, float *y, float *out, size_t n) \n\
{ \n\
size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n\
if (tid < n) { \n\
out[tid] = compute(a, x[tid], y[tid]); \n\
} \n\
} \n";
int main(int argc, char *argv[])
{
size_t numBlocks = 32;
size_t numThreads = 128;
// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
NVRTC_SAFE_CALL(
nvrtcCreateProgram(&prog, // prog
lto_saxpy, // buffer
"lto_saxpy.cu", // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
// specify that LTO IR should be generated for LTO operation
const char *opts[] = {"-dlto",
"--relocatable-device-code=true"};
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
2, // numOptions
opts); // options
// Obtain compilation log from the program.
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char *log = new char[logSize];
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
std::cout << log << '\n';
delete[] log;
if (compileResult != NVRTC_SUCCESS) {
exit(1);
}
// Obtain generated LTO IR from the program.
size_t LTOIRSize;
NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, <OIRSize));
char *LTOIR = new char[LTOIRSize];
NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, LTOIR));
// Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction kernel;
CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
// Load the generated LTO IR and the LTO IR generated offline
// and link them together.
nvJitLinkHandle handle;
// Dynamically determine the arch to link for
int major = 0;
int minor = 0;
CUDA_SAFE_CALL(cuDeviceGetAttribute(&major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
int arch = major*10 + minor;
char smbuf[16];
sprintf(smbuf, "-arch=sm_%d", arch);
const char *lopts[] = {"-lto", smbuf};
NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts));
// NOTE: assumes "offline.fatbin" is in the current directory
// The fatbinary contains LTO IR generated offline using nvcc
NVJITLINK_SAFE_CALL(handle, nvJitLinkAddFile(handle, NVJITLINK_INPUT_FATBIN,
"offline.fatbin"));
NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR,
(void *)LTOIR, LTOIRSize, "lto_online"));
// The call to nvJitLinkComplete causes linker to link together the two
// LTO IR modules (offline and online), do optimization on the linked LTO IR,
// and generate cubin from it.
NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle));
size_t cubinSize;
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize));
void *cubin = malloc(cubinSize);
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin));
NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle));
CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin));
CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy"));
// Generate input for execution, and create output buffers.
size_t n = NUM_THREADS * NUM_BLOCKS;
size_t bufferSize = n * sizeof(float);
float a = 5.1f;
float *hX = new float[n], *hY = new float[n], *hOut = new float[n];
for (size_t i = 0; i < n; ++i) {
hX[i] = static_cast<float>(i);
hY[i] = static_cast<float>(i * 2);
}
CUdeviceptr dX, dY, dOut;
CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize));
CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize));
CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize));
CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize));
CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize));
// Execute SAXPY.
void *args[] = { &a, &dX, &dY, &dOut, &n };
CUDA_SAFE_CALL(
cuLaunchKernel(kernel,
NUM_BLOCKS, 1, 1, // grid dim
NUM_THREADS, 1, 1, // block dim
0, NULL, // shared mem and stream
args, 0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize());
// Retrieve and print output.
CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize));
for (size_t i = 0; i < n; ++i) {
std::cout << a << " * " << hX[i] << " + " << hY[i]
<< " = " << hOut[i] << '\n';
}
// Release resources.
CUDA_SAFE_CALL(cuMemFree(dX));
CUDA_SAFE_CALL(cuMemFree(dY));
CUDA_SAFE_CALL(cuMemFree(dOut));
CUDA_SAFE_CALL(cuModuleUnload(module));
CUDA_SAFE_CALL(cuCtxDestroy(context));
free(cubin);
delete[] hX;
delete[] hY;
delete[] hOut;
delete[] LTOIR;
return 0;
}
offline code
__device__ float compute(float a, float x, float y) {
return a * x + y;
}
What am I doing wrong and how can I overcome it?
Offline file needs to compiled with separable compilation (-rdc=true
). Not doing so will result in the function being removed, as the compiler thinks it is unused. CUDA_SEPARABLE_COMPILATION
can be used to enable separate compilation in CMake.
I tried using CUDA_SEPARABLE_COMPILATION
with your example and it works:
$ tail -n3 CMakeLists.txt
set_target_properties(offlineLib PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_FATBIN_COMPILATION ON)
$ (cd ./CMakeFiles/offlineLib.dir/ && compute-sanitizer ../../TestJitLto) | tail
5.1 * 4087 + 8174 = 29017.7
5.1 * 4088 + 8176 = 29024.8
5.1 * 4089 + 8178 = 29031.9
5.1 * 4090 + 8180 = 29039
5.1 * 4091 + 8182 = 29046.1
5.1 * 4092 + 8184 = 29053.2
5.1 * 4093 + 8186 = 29060.3
5.1 * 4094 + 8188 = 29067.4
5.1 * 4095 + 8190 = 29074.5
========= ERROR SUMMARY: 0 errors