Tags: c++, boost, opencl, boost-compute

Performance: Boost.Compute vs. the OpenCL C++ wrapper


The following programs add two vectors using Boost.Compute and the OpenCL C++ wrapper, respectively. The results show that Boost.Compute is almost 20 times slower than the OpenCL C++ wrapper. I wonder whether I am misusing Boost.Compute or whether it is indeed slow. Platform: Windows 7, VS2013, Boost 1.55, Boost.Compute 0.2, ATI Radeon HD 4600.

Code using the OpenCL C++ wrapper:

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <boost/timer/timer.hpp>
#include <boost/smart_ptr/scoped_array.hpp>
#include <fstream>
#include <numeric>
#include <algorithm>
#include <functional>

int main(){
    static char kernelSourceCode[] = "\
__kernel void vadd(__global int * a, __global int * b, __global int * c){\
    size_t i = get_global_id(0);\
    \
    c[i] = a[i] + b[i];\
    }\
";

    using type = boost::scoped_array<int>;
    size_t const BUFFER_SIZE = 1UL << 13;
    type A(new int[BUFFER_SIZE]);
    type B(new int[BUFFER_SIZE]);
    type C(new int[BUFFER_SIZE]);

    std::iota(A.get(), A.get() + BUFFER_SIZE, 0);
    std::transform(A.get(), A.get() + BUFFER_SIZE, B.get(), std::bind(std::multiplies<int>(), std::placeholders::_1, 2));

    try {
        std::vector<cl::Platform> platformList;
        // Pick platform
        cl::Platform::get(&platformList);
        // Pick first platform
        cl_context_properties cprops[] = {
            CL_CONTEXT_PLATFORM,
            (cl_context_properties)(platformList[0])(),
            0
        };
        cl::Context context(CL_DEVICE_TYPE_GPU, cprops);
        // Query the set of devices attached to the context
        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
        // Create command-queue
        cl::CommandQueue queue(context, devices[0], 0);
        // Create the program from source
        cl::Program::Sources sources(
            1,
            std::make_pair(kernelSourceCode, 0)
            );
        cl::Program program(context, sources);
        // Build program
        program.build(devices);
        // Create buffer for A and copy host contents
        cl::Buffer aBuffer = cl::Buffer(
            context,
            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
            BUFFER_SIZE * sizeof(int),
            (void *)&A[0]);
        // Create buffer for B and copy host contents
        cl::Buffer bBuffer = cl::Buffer(
            context,
            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
            BUFFER_SIZE * sizeof(int),
            (void *)&B[0]);
        // Create buffer that uses the host ptr C
        cl::Buffer cBuffer = cl::Buffer(
            context,
            CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
            BUFFER_SIZE * sizeof(int),
            (void *)&C[0]);
        // Create kernel object
        cl::Kernel kernel(program, "vadd");
        // Set kernel args
        kernel.setArg(0, aBuffer);
        kernel.setArg(1, bBuffer);
        kernel.setArg(2, cBuffer);
        // Do the work
        void *output;
        {
            boost::timer::auto_cpu_timer timer;
            queue.enqueueNDRangeKernel(
                kernel,
                cl::NullRange,
                cl::NDRange(BUFFER_SIZE),
                cl::NullRange
                );
            output = (int *)queue.enqueueMapBuffer(
                cBuffer,
                CL_TRUE, // block
                CL_MAP_READ,
                0,
                BUFFER_SIZE * sizeof(int)
                );
        }
        std::ofstream gpu("gpu.txt");
        for (int i = 0; i < BUFFER_SIZE; i++) {
            gpu << C[i] << " ";
        }
        queue.enqueueUnmapMemObject(
            cBuffer,
            output);
    }
    catch (cl::Error const &err) {
        std::cerr << err.what() << "\n";
    }

    return EXIT_SUCCESS;
}

Code using Boost.Compute:

#include <boost/compute/container/mapped_view.hpp>
 #include <boost/compute/algorithm/transform.hpp>
 #include <boost/compute/functional/operator.hpp>
 #include <numeric>
 #include <algorithm>
 #include <functional>
 #include <boost/timer/timer.hpp>
 #include <boost/smart_ptr/scoped_array.hpp>
 #include <fstream>
 #include <boost/tuple/tuple_comparison.hpp>

 // Adds two integer vectors on the default OpenCL device with Boost.Compute
 // and dumps the element-wise sums to "gpu.txt".
 //
 // Returns EXIT_SUCCESS when the computation ran, EXIT_FAILURE when the
 // default device is not a GPU or Boost.Compute throws opencl_error.
 int main(){
     size_t const BUFFER_SIZE = 1UL << 13;
     boost::scoped_array<int> A(new int[BUFFER_SIZE]), B(new int[BUFFER_SIZE]), C(new int[BUFFER_SIZE]);

     // A = 0, 1, 2, ...   B = 2 * A
     std::iota(A.get(), A.get() + BUFFER_SIZE, 0);
     std::transform(A.get(), A.get() + BUFFER_SIZE, B.get(), std::bind(std::multiplies<int>(), std::placeholders::_1, 2));

     try{
         if (boost::compute::system::default_device().type() != CL_DEVICE_TYPE_GPU){
             std::cerr << "Not GPU\n";
             // Nothing was computed, so do not report success.
             return EXIT_FAILURE;
         }

         boost::compute::command_queue queue = boost::compute::system::default_queue();
         // Views over the host arrays: A and B as read-only inputs, C as output.
         boost::compute::mapped_view<int> mA(static_cast<const int*>(A.get()), BUFFER_SIZE),
             mB(static_cast<const int*>(B.get()), BUFFER_SIZE);
         boost::compute::mapped_view<int> mC(C.get(), BUFFER_SIZE);
         {
             // Timed region: transform() plus mapping the result for reading.
             // Note: the first call to transform() also creates and compiles
             // its kernel, so that one-off cost is included in this timing.
             boost::timer::auto_cpu_timer timer;
             boost::compute::transform(
                 mA.cbegin(), mA.cend(),
                 mB.cbegin(),
                 mC.begin(),
                 boost::compute::plus<int>(),
                 queue
                 );
             mC.map(CL_MAP_READ, queue);
         }
         // Results are read through the host array C that backs mC.
         std::ofstream gpu("gpu.txt");
         for (size_t i = 0; i != BUFFER_SIZE; ++i) gpu << C[i] << " ";
         mC.unmap(queue);
     }
     catch (boost::compute::opencl_error const &err){
         std::cerr << err.what() << "\n";
         // Report the failure to the caller instead of claiming success.
         return EXIT_FAILURE;
     }

     return EXIT_SUCCESS;
 }

Solution

  • The kernel code generated by the transform() function in Boost.Compute should be almost identical to the kernel code you use in the C++ wrapper version (though Boost.Compute will do some unrolling).

    The reason you see a difference in timings is that in the first version you are only measuring the time it takes to enqueue the kernel and map the results back to the host. In the Boost.Compute version you are also measuring the amount of time it takes to create the transform() kernel, compile it, and then execute it. If you want a more realistic comparison you should measure the total execution time for the first example including the time it takes to set up and compile the OpenCL program.

    This initialization penalty (which is inherent in OpenCL's run-time compilation model) is somewhat mitigated in Boost.Compute by automatically caching compiled kernels during run-time (and also optionally caching them offline for reuse the next time the program is run). Calling transform() multiple times will be much faster after the first invocation.

    P.S. You can also just use the core wrapper classes in Boost.Compute (like device and context) along with the container classes (like vector<T>) and still run your own custom kernels.