Speedup sum of intensities calculation by using JOCL/OPENCL

Hi, I'm new to JOCL (OpenCL). I wrote this code to compute the sum of the intensities per image. The kernel takes a 1D array containing the pixels of all the images placed one after another. An image is 300x300, so that's 90000 pixels per image. At the moment it runs slower than when I do this sequentially.

My code

package PAR;

/*
 * JOCL - Java bindings for OpenCL
 * 
 * Copyright 2009 Marco Hutter - http://www.jocl.org/
 */
import IMAGE_IO.ImageReader;
import IMAGE_IO.Input_Folder;
import static org.jocl.CL.*;

import org.jocl.*;

/**
 * A small JOCL sample that sums the pixel intensities of each image with an
 * OpenCL kernel, using one work-item per image.
 */
public class IPPARA {

    /**
     * Number of pixels that make up one image (300 x 300). The kernel and the
     * host code must agree on this value, so it is defined once here and
     * spliced into the kernel source.
     */
    private static final int PIXELS_PER_IMAGE = 300 * 300;

    /**
     * The source code of the OpenCL program to execute.
     *
     * Work-item {@code i} accumulates the elements
     * {@code a[i*PIXELS_PER_IMAGE] .. a[(i+1)*PIXELS_PER_IMAGE - 1]} in a
     * private variable and stores the total in {@code c[i]}.
     */
    private static final String programSource =
            "__kernel void "
            + "sampleKernel(__global uint *a,"
            + "             __global uint *c)"
            + "{"
            + "__private uint intensity_core=0;"
            + "      uint i = get_global_id(0);"
            + "      for(uint j=i*" + PIXELS_PER_IMAGE
            + "; j < (i+1)*" + PIXELS_PER_IMAGE + "; j++){ "
            + "              intensity_core += a[j];"
            + "     }"
            + "c[i]=intensity_core;"
            + "}";

    /**
     * The entry point of this sample. Reads the images, runs the kernel and
     * prints the elapsed time (measured from just after the images are read
     * until all OpenCL objects have been released).
     *
     * @param args Not used
     */
    public static void main(String args[]) {
        long numBytes[] = new long[1];

        ImageReader imagereader = new ImageReader();
        // NOTE(review): assumed to hold the pixels of all images back to back,
        // PIXELS_PER_IMAGE values per image - confirm against ImageReader.
        int srcArrayA[] = imagereader.readImages();

        long before = System.nanoTime();

        // Only complete images are processed; a trailing partial image is ignored.
        int numImages = srcArrayA.length / PIXELS_PER_IMAGE;
        if (numImages == 0) {
            // A zero-sized buffer and a zero global work size are both invalid
            // in OpenCL, so stop before touching the device.
            System.out.println("No complete image to process");
            return;
        }
        int dstArray[] = new int[numImages];

        Pointer srcA = Pointer.to(srcArrayA);
        Pointer dst = Pointer.to(dstArray);

        // Widen before multiplying so large inputs cannot overflow int.
        long srcBytes = (long) Sizeof.cl_uint * srcArrayA.length;
        long dstBytes = (long) Sizeof.cl_uint * numImages;

        // Obtain the platform IDs and initialize the context properties
        System.out.println("Obtaining platform...");
        cl_platform_id platforms[] = new cl_platform_id[1];
        clGetPlatformIDs(platforms.length, platforms, null);
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);

        // Create an OpenCL context on a CPU device. Exceptions are not enabled
        // yet, so a failure is reported by a null return value.
        cl_context context = clCreateContextFromType(
                contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);
        if (context == null) {
            System.out.println("Unable to create a context");
            return;
        }

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);

        // Query the size of the device list associated with the context
        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);

        // Obtain the cl_device_ids; the first one is used below
        int numDevices = (int) (numBytes[0] / Sizeof.cl_device_id);
        cl_device_id devices[] = new cl_device_id[numDevices];
        clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
                Pointer.to(devices), null);

        // Create a command-queue
        cl_command_queue commandQueue =
                clCreateCommandQueue(context, devices[0], 0, null);

        // Allocate the memory objects for the input- and output data
        cl_mem memObjects[] = new cl_mem[2];
        memObjects[0] = clCreateBuffer(context,
                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                srcBytes, srcA, null);
        memObjects[1] = clCreateBuffer(context,
                CL_MEM_READ_WRITE,
                dstBytes, null, null);

        // Create the program from the source code
        cl_program program = clCreateProgramWithSource(context,
                1, new String[]{programSource}, null, null);

        // Build the program
        clBuildProgram(program, 0, null, null, null, null);

        // Create the kernel
        cl_kernel kernel = clCreateKernel(program, "sampleKernel", null);

        // Set the arguments for the kernel
        clSetKernelArg(kernel, 0,
                Sizeof.cl_mem, Pointer.to(memObjects[0]));
        clSetKernelArg(kernel, 1,
                Sizeof.cl_mem, Pointer.to(memObjects[1]));

        // One work-item per image. The local work size is left to the OpenCL
        // implementation (null) instead of forcing one work-item per group.
        long global_work_size[] = new long[]{numImages};

        // Execute the kernel
        clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
                global_work_size, null, 0, null, null);

        // Read the output data (blocking read); the size matches the
        // cl_uint-sized allocation of the output buffer above.
        clEnqueueReadBuffer(commandQueue, memObjects[1], CL_TRUE, 0,
                dstBytes, dst, 0, null, null);

        // Release kernel, program, and memory objects
        clReleaseMemObject(memObjects[0]);
        clReleaseMemObject(memObjects[1]);
        clReleaseKernel(kernel);
        clReleaseProgram(program);
        clReleaseCommandQueue(commandQueue);
        clReleaseContext(context);

        long after = System.nanoTime();

        System.out.println("Time: " + (after - before) / 1e9);

    }
}

After applying the suggestions in the answers, the parallel code running on the CPU is almost as fast as the sequential code. Are there any more improvements that can be made?

Solution

 for(uint j=i*90000; j < (i+1)*90000; j++){ "
        + "              c[i] += a[j];"

1) You are using global memory (c[]) to accumulate the sum, and this is slow. Use a private variable to make it faster. Something like this:

          "__kernel void "
        + "sampleKernel(__global uint *a,"
        + "             __global uint *c)"
        + "{"
        + "__private uint intensity_core=0;" <---this is a private variable of each core
        + "      uint i = get_global_id(0);"
        + "      for(uint j=i*90000; j < (i+1)*90000; j++){ "
        + "              intensity_core += a[j];" <---register is at least 100x faster than global memory
         //but we cannot get rid of a[] so the calculation time cannot be less than %50
        + "     }"
        + "c[i]=intensity_core;"   
        + "}";  //expecting %100 speedup

Now you have c[number of images] array of sum-of-intensities.

Your local work size is 1, so if you have at least 160 images (160 being your GPU's core count), the calculation will use all cores.

You will need 90000*num_images global-memory reads, num_images global-memory writes, and 90000*num_images register reads/writes. Using registers will halve your kernel time.

2) You are doing only one math operation per two memory accesses. You need at least 10 math operations per memory access to use even a small fraction of the peak GFLOPS of your GPU (250 GFLOPS peak for the 6490M).

Your i7 CPU can easily reach 100 GFLOPS, but your memory will be the bottleneck. This is even worse when you send the whole data set through PCI Express. (The HD Graphics 3000 is rated at 125 GFLOPS.)

 // Obtain a device ID 
    cl_device_id devices[] = new cl_device_id[numDevices];
    clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
    cl_device_id device = devices[deviceIndex];
 //one of devices[] element must be your HD3000.Example: devices[0]->gpu devices[1]->cpu 
 //devices[2]-->HD3000

In your program:

 // Obtain the cl_device_id for the first device
    int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
    cl_device_id devices[] = new cl_device_id[numDevices];
    clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
            Pointer.to(devices), null);

takes the first device, which is probably the GPU.