I'm writing a CUDA kernel that is compiled at runtime using NVRTC (CUDA version 9.2 with NVRTC version 7.5), which needs the stdint.h
header, in order to have the int32_t
etc. types.
If I write the kernel source code without the include, it works correctly. For example the kernel
extern "C" __global__ void f() { ... }
Compiles to PTX code where f is defined as .visible .entry f
.
But if the kernel source code is
#include <stdint.h>
extern "C" __global__ void f() { ... }
it reports A function without execution space annotations (__host__/__device__/__global__) is considered a host function, and host functions are not allowed in JIT mode.
(also without extern "C"
).
Passing -default-device
makes the PTX code .visible .func f
, so the function cannot be called from the host.
Is there a way to include headers in the source code, and still have a __global__
entry function? Or alternately, a way to know which integer size convention is used on the by the NVRTC compiler, so that the int32_t
etc. types can be manually defined?
Edit: Example program that shows the problem:
#include <cstdlib>
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>
[[noreturn]] void fail(const std::string& msg, int code) {
std::cerr << "error: " << msg << " (" << code << ')' << std::endl;
std::exit(EXIT_FAILURE);
}
std::unique_ptr<char[]> compile_to_ptx(const char* program_source) {
nvrtcResult rv;
// create nvrtc program
nvrtcProgram prog;
rv = nvrtcCreateProgram(
&prog,
program_source,
"program.cu",
0,
nullptr,
nullptr
);
if(rv != NVRTC_SUCCESS) fail("nvrtcCreateProgram", rv);
// compile nvrtc program
std::vector<const char*> options = {
"--gpu-architecture=compute_30"
};
//options.push_back("-default-device");
rv = nvrtcCompileProgram(prog, options.size(), options.data());
if(rv != NVRTC_SUCCESS) {
std::size_t log_size;
rv = nvrtcGetProgramLogSize(prog, &log_size);
if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLogSize", rv);
auto log = std::make_unique<char[]>(log_size);
rv = nvrtcGetProgramLog(prog, log.get());
if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLog", rv);
assert(log[log_size - 1] == '\0');
std::cerr << "Compile error; log:\n" << log.get() << std::endl;
fail("nvrtcCompileProgram", rv);
}
// get ptx code
std::size_t ptx_size;
rv = nvrtcGetPTXSize(prog, &ptx_size);
if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTXSize", rv);
auto ptx = std::make_unique<char[]>(ptx_size);
rv = nvrtcGetPTX(prog, ptx.get());
if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTX", rv);
assert(ptx[ptx_size - 1] == '\0');
nvrtcDestroyProgram(&prog);
return ptx;
}
const char program_source[] = R"%%%(
//#include <stdint.h>
extern "C" __global__ void f(int* in, int* out) {
out[threadIdx.x] = in[threadIdx.x];
}
)%%%";
int main() {
CUresult rv;
// initialize CUDA
rv = cuInit(0);
if(rv != CUDA_SUCCESS) fail("cuInit", rv);
// compile program to ptx
auto ptx = compile_to_ptx(program_source);
std::cout << "PTX code:\n" << ptx.get() << std::endl;
}
When //#include <stdint.h>
in the kernel source is uncommented it no longer compiles. When //options.push_back("-default-device");
is uncommented it compiles but does not mark the function f
as .entry
.
CMakeLists.txt to compile it (needs CUDA driver API + NVRTC)
cmake_minimum_required(VERSION 3.4)
project(cudabug CXX)
find_package(CUDA REQUIRED)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED 14)
add_executable(cudabug cudabug.cc)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
link_directories(${CUDA_LIBRARY_DIRS})
target_link_libraries(cudabug PUBLIC ${CUDA_LIBRARIES} nvrtc cuda)
[Preface: this is a very hacky answer, and is specific to the GNU toolchain (although I suspect the problem in the question is also specific to the GNU toolchain)].
It would appear that the problem here is with the GNU standard header features.h
, which gets pulled into stdint.h
and then winds up defining a lot of stub functions which have the default __host__
compilation space. This causes nvrtc to blow up. It also seems that the -default-device
option will result in a resolved glibC compiler feature set which makes the whole nvrtc compiler fail.
You can defeat this (in a very hacky way) by predefining a feature set for the standard library which excludes all the host functions. Changing your JIT kernel code to
const char program_source[] = R"%%%(
#define __ASSEMBLER__
#define __extension__
#include <stdint.h>
extern "C" __global__ void f(int32_t* in, int32_t* out) {
out[threadIdx.x] = in[threadIdx.x];
}
)%%%";
got me this:
$ nvcc -std=c++14 -ccbin=g++-7 jit_header.cu -o jitheader -lnvrtc -lcuda
$ ./jitheader
PTX code:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-24330188
// Cuda compilation tools, release 9.2, V9.2.148
// Based on LLVM 3.4svn
//
.version 6.2
.target sm_30
.address_size 64
// .globl f
.visible .entry f(
.param .u64 f_param_0,
.param .u64 f_param_1
)
{
.reg .b32 %r<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [f_param_0];
ld.param.u64 %rd2, [f_param_1];
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.u32 %r2, [%rd6];
add.s64 %rd7, %rd3, %rd5;
st.global.u32 [%rd7], %r2;
ret;
}
Big caveat: This worked on the glibC system I tried it on. It probably won't work with other toolchains or libC implementations (if, indeed, they have this problem).