During implementation of CUDA code I often need some utility functions, which will be called from device and also from host code. So I declare these functions as __host__ __device__. This is OK and possible device/host incompabilities can be handled by #ifdef CUDA_ARCH.
Problems come when the utility function is templated ie. by some functor type. If the template instance calls a __host__ function I get this warning:
calling a __host__ function from a __host__ __device__ function is not allowed
detected during instantiation of "int foo(const T &) [with T=HostObject]"
Only solution I know is to define the function twice - once for device and once for host code with different name (I cannot overload on __host__ __device__
). But this means that there is code duplication and all other __host__ __device__
functions which will call it, must be also defined twice (even more code duplication).
Simplified example:
#include <cuda.h>
#include <iostream>
struct HostObject {
__host__
int value() const { return 42; }
};
struct DeviceObject {
__device__
int value() const { return 3; }
};
template <typename T>
__host__ __device__
int foo(const T &obj) {
return obj.value();
}
/*
template <typename T>
__host__
int foo_host(const T &obj) {
return obj.value();
}
template <typename T>
__device__
int foo_device(const T &obj) {
return obj.value();
}
*/
__global__ void kernel(int *data) {
data[threadIdx.x] = foo(DeviceObject());
}
int main() {
foo(HostObject());
int *data;
cudaMalloc((void**)&data, sizeof(int) * 64);
kernel<<<1, 64>>>(data);
cudaThreadSynchronize();
cudaFree(data);
}
Warning is caused by the foo(HostObject());
call inside the main()
function.
foo_host<>
and foo_device<>
are possible replacements for the problematic foo<>
.
Is there a better solution? Can I prevent instantion of foo()
on the device side?
You cannot prevent instantiation of either half of a __host__ __device__
function template instantiation. If you instantiate the function by calling it on the host (device), the compiler will also instantiate the device (host) half.
The best you can do for your use case as of CUDA 7.0 is to suppress the warning using #pragma hd_warning_disable
as in the following example and ensure that the function is not called incorrectly.
#include <iostream>
#include <cstdio>
#pragma hd_warning_disable
template<class Function>
__host__ __device__
void invoke(Function f)
{
f();
}
struct host_only
{
__host__
void operator()()
{
std::cout << "host_only()" << std::endl;
}
};
struct device_only
{
__device__
void operator()()
{
printf("device_only(): thread %d\n", threadIdx.x);
}
};
__global__
void kernel()
{
// use from device with device functor
invoke(device_only());
// XXX error
// invoke(host_only());
}
int main()
{
// use from host with host functor
invoke(host_only());
kernel<<<1,1>>>();
cudaDeviceSynchronize();
// XXX error
// invoke(device_only());
return 0;
}