The following two programs each add two vectors, one using Boost.Compute and the other using the OpenCL C++ wrapper. The results show that Boost.Compute is almost 20 times slower than the OpenCL C++ wrapper. I wonder whether I am misusing Boost.Compute or whether it is indeed slow. Platform: Windows 7, VS2013, Boost 1.55, Boost.Compute 0.2, ATI Radeon HD 4600.
Code using the OpenCL C++ wrapper:
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <boost/timer/timer.hpp>
#include <boost/smart_ptr/scoped_array.hpp>
#include <fstream>
#include <numeric>
#include <algorithm>
#include <functional>
int main(){
static char kernelSourceCode[] = "\
__kernel void vadd(__global int * a, __global int * b, __global int * c){\
size_t i = get_global_id(0);\
\
c[i] = a[i] + b[i];\
}\
";
using type = boost::scoped_array<int>;
size_t const BUFFER_SIZE = 1UL << 13;
type A(new int[BUFFER_SIZE]);
type B(new int[BUFFER_SIZE]);
type C(new int[BUFFER_SIZE]);
std::iota(A.get(), A.get() + BUFFER_SIZE, 0);
std::transform(A.get(), A.get() + BUFFER_SIZE, B.get(), std::bind(std::multiplies<int>(), std::placeholders::_1, 2));
try {
std::vector<cl::Platform> platformList;
// Pick platform
cl::Platform::get(&platformList);
// Pick first platform
cl_context_properties cprops[] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platformList[0])(),
0
};
cl::Context context(CL_DEVICE_TYPE_GPU, cprops);
// Query the set of devices attached to the context
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create command-queue
cl::CommandQueue queue(context, devices[0], 0);
// Create the program from source
cl::Program::Sources sources(
1,
std::make_pair(kernelSourceCode, 0)
);
cl::Program program(context, sources);
// Build program
program.build(devices);
// Create buffer for A and copy host contents
cl::Buffer aBuffer = cl::Buffer(
context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
BUFFER_SIZE * sizeof(int),
(void *)&A[0]);
// Create buffer for B and copy host contents
cl::Buffer bBuffer = cl::Buffer(
context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
BUFFER_SIZE * sizeof(int),
(void *)&B[0]);
// Create buffer that uses the host ptr C
cl::Buffer cBuffer = cl::Buffer(
context,
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
BUFFER_SIZE * sizeof(int),
(void *)&C[0]);
// Create kernel object
cl::Kernel kernel(program, "vadd");
// Set kernel args
kernel.setArg(0, aBuffer);
kernel.setArg(1, bBuffer);
kernel.setArg(2, cBuffer);
// Do the work
void *output;
{
boost::timer::auto_cpu_timer timer;
queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange(BUFFER_SIZE),
cl::NullRange
);
output = (int *)queue.enqueueMapBuffer(
cBuffer,
CL_TRUE, // block
CL_MAP_READ,
0,
BUFFER_SIZE * sizeof(int)
);
}
std::ofstream gpu("gpu.txt");
for (int i = 0; i < BUFFER_SIZE; i++) {
gpu << C[i] << " ";
}
queue.enqueueUnmapMemObject(
cBuffer,
output);
}
catch (cl::Error const &err) {
std::cerr << err.what() << "\n";
}
return EXIT_SUCCESS;
}
Code using Boost.Compute:
#include <boost/compute/container/mapped_view.hpp>
#include <boost/compute/algorithm/transform.hpp>
#include <boost/compute/functional/operator.hpp>
#include <numeric>
#include <algorithm>
#include <functional>
#include <boost/timer/timer.hpp>
#include <boost/smart_ptr/scoped_array.hpp>
#include <fstream>
#include <boost/tuple/tuple_comparison.hpp>
// Adds two int vectors with boost::compute::transform on the default device
// and writes the element-wise sums to "gpu.txt".
//
// The timed region covers the transform() call plus mapping the result view
// for reading; the views themselves are created before the timer starts.
//
// Returns EXIT_SUCCESS on success, and EXIT_FAILURE when the default device is
// not a GPU or when boost::compute::opencl_error is thrown.
int main(){
    size_t const BUFFER_SIZE = 1UL << 13;
    boost::scoped_array<int> A(new int[BUFFER_SIZE]);
    boost::scoped_array<int> B(new int[BUFFER_SIZE]);
    boost::scoped_array<int> C(new int[BUFFER_SIZE]);
    // A = 0, 1, 2, ...; B = 2 * A.
    std::iota(A.get(), A.get() + BUFFER_SIZE, 0);
    std::transform(A.get(), A.get() + BUFFER_SIZE, B.get(),
                   [](int value) { return value * 2; });
    try{
        // cl_device_type is a bitfield, so test the GPU bit rather than equality.
        if (!(boost::compute::system::default_device().type() & CL_DEVICE_TYPE_GPU)){
            std::cerr << "Not GPU\n";
            return EXIT_FAILURE;
        }
        boost::compute::command_queue queue = boost::compute::system::default_queue();
        // Inputs are wrapped through const pointers; the output view wraps C
        // through a mutable pointer.
        boost::compute::mapped_view<int> mA(static_cast<const int*>(A.get()), BUFFER_SIZE);
        boost::compute::mapped_view<int> mB(static_cast<const int*>(B.get()), BUFFER_SIZE);
        boost::compute::mapped_view<int> mC(C.get(), BUFFER_SIZE);
        {
            // The timer reports when this inner scope ends.
            boost::timer::auto_cpu_timer timer;
            boost::compute::transform(
                mA.cbegin(), mA.cend(),
                mB.cbegin(),
                mC.begin(),
                boost::compute::plus<int>(),
                queue
            );
            mC.map(CL_MAP_READ, queue);
        }
        // Results are read from the host array C while the view is mapped.
        std::ofstream gpu("gpu.txt");
        for (size_t i = 0; i != BUFFER_SIZE; ++i) {
            gpu << C[i] << " ";
        }
        mC.unmap(queue);
    }
    catch (boost::compute::opencl_error const &err){
        std::cerr << err.what() << "\n";
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
The kernel code generated by the `transform()` function in Boost.Compute should be almost identical to the kernel code you use in the C++ wrapper version (though Boost.Compute will do some unrolling).
The reason you see a difference in timings is that in the first version you are only measuring the time it takes to enqueue the kernel and map the results back to the host. In the Boost.Compute version you are also measuring the amount of time it takes to create the `transform()` kernel, compile it, and then execute it. If you want a more realistic comparison, you should measure the total execution time for the first example, including the time it takes to set up and compile the OpenCL program.
This initialization penalty (which is inherent in OpenCL's run-time compilation model) is somewhat mitigated in Boost.Compute by automatically caching compiled kernels during run-time (and also optionally caching them offline for reuse the next time the program is run). Calling `transform()` multiple times will be much faster after the first invocation.
P.S. You can also just use the core wrapper classes in Boost.Compute (like `device` and `context`) along with the container classes (like `vector<T>`) and still run your own custom kernels.