PyOpenCl simple matrix multiplication

I am trying to learn PyOpenCl. I am following various tutorials/examples that I found online, and I have been trying to put together a simple matrix multiplication. I fail to see why I cannot get the right results: it seems to me that the for cycle in my kernel is not being executed (the output C_flat is always zeros) or maybe I am managing in the wrong way some memory. Can anyone give me any suggestions? Thanks a lot!

Here is the code:

import numpy as np
import pyopencl as cl
import time

def create_input_memory(context, input_arrays):
    return [(array, cl.Buffer(context, flags=cl.mem_flags.READ_ONLY, size=array.nbytes))
            for array in input_arrays]

def create_output_memory(context, output_arrays):
    return [(array, cl.Buffer(context, flags=cl.mem_flags.WRITE_ONLY, size=array.nbytes))
            for array in output_arrays]

def matrix_multiply_gpu(A, B):
    A_height, A_width = A.shape[0], A.shape[1]
    B_height, B_width = B.shape[0], B.shape[1]
    C = np.zeros((A_height, B_width))
    A_flat = A.flatten()
    B_flat = B.flatten()
    C_flat = C.flatten()
    print(C_flat)

    kernel_source = """
        kernel void mul(int Wa, int Ha, int Wb, int Hb,
                        global float *input_a,
                        global float *input_b,
                        global float *result){
            /* ROW MAJOR notation (I imagine the "GPU matrix") --> no, just model*/
            int row = get_global_id(0);
            int col = get_global_id(1);
            float sum = 0.0f;
            for (int i = 0; i < Wa; i++){
                sum += input_a[row * Wa + i] * input_b[i * Wb + col];
            }
            result[row * Wb + col] = sum;
                        }
    """
    platforms = cl.get_platforms()

    context = cl.Context(dev_type=cl.device_type.GPU,
        properties=[(cl.context_properties.PLATFORM, platforms[0])])
    gpu_program_source = cl.Program(context, kernel_source)
    gpu_program = gpu_program_source.build()

    input_tuples = create_input_memory(context,
                                        (A_flat, B_flat))
    output_tuples = create_output_memory(context, (C_flat,)) 

    gpu_queue = cl.CommandQueue(context)    
        
    kernel_arguments = [buffer for (_,buffer) in input_tuples]
    kernel_arguments += [buffer for (_,buffer) in output_tuples]

    gpu_program.mul(gpu_queue, (1024,), (32,), 
                        np.int32(A_height), np.int32(A_width), np.int32(B_height), 
                        np.int32(B_width), *kernel_arguments)

    for (array, buffer) in output_tuples:
        cl.enqueue_copy(gpu_queue, src=buffer, dest=array)
    
    #wait for everyone to finish
    gpu_queue.finish()

    return C_flat

if __name__=='__main__':
    A, B = np.ones((100, 100)), np.ones((100, 100))
    C = matrix_multiply_gpu(A, B)
    print("\n", C, "\n")

Solution

Here are some of the issues with your code:

You send height-width pairs, but your kernel reads width-height.
You are sending flattened arrays, but expect a 2 dim array in your kernel.
You didn't copy the data to the gpu, there is no COPY_HOST_PTR flag.
When your code is not working, trying to automate it with helper functions only obscures the logic. Do it after your code is working. I won't use your create_... functions.
I find it easier to just copy the array to the device, instead of making the buffer, that's why I'm going to do it that way, but It should work the same either way.
Always specify the dtype of the data you are sending, and make sure it matches with what your kernel expects. I have spent countless hours debugging just because of this.

Here is the code:

import numpy as np
import pyopencl as cl
import pyopencl.array

def matrix_multiply_gpu(A, B):
    # ----------------------------this is your code, with minor changes
    A_height, A_width = A.shape
    B_height, B_width = B.shape
    C = np.empty((A_height, B_width), dtype=np.float32)  # some changes
    A_flat = A.flatten()
    B_flat = B.flatten()
    C_flat = C.flatten()
    platforms = cl.get_platforms()
    context = cl.Context(dev_type=cl.device_type.GPU,
        properties=[(cl.context_properties.PLATFORM, platforms[0])])
    gpu_queue = cl.CommandQueue(context)    
    # ------------------------------------------------------------------

    # --------------------------------------This is new or modified code
    size = A_height * B_width

    kernel_source = """
        kernel void mul(int Ha, int Wa, int Hb, int Wb,
                        global const float *input_a,
                        global const float *input_b,
                        global       float *result     ){
            int gid = get_global_id(0);
            int Arow = gid / Wb;
            int Bcol = gid % Wb;
            float sum = 0.0f;
            for (int i = 0; i < Wa; i++)
                sum += input_a[Arow*Wa+i] * input_b[i*Wb+Bcol];
            result[gid] = sum;
        }
        """
    Ad = cl.array.to_device(gpu_queue, A_flat)
    Bd = cl.array.to_device(gpu_queue, B_flat)
    Cd = cl.array.to_device(gpu_queue, C_flat)
    # --------------------------------- and the return is different too
    
    # -------------------------------your code again with minor changes
    gpu_program_source = cl.Program(context, kernel_source)
    gpu_program = gpu_program_source.build()
    gpu_program.mul(gpu_queue, (size,), None,  # some changes here
                        np.int32(A_height), np.int32(A_width), 
                        np.int32(B_height), np.int32(B_width), 
                        Ad.data, Bd.data, Cd.data)
    # -----------------------------------------------------------------

    return Cd.get().reshape((A_height, B_width)) 
    # or send it flattened if you wish, without the reshape

if __name__=='__main__':
    from pprint import pprint
    A = 2.0*np.ones((5, 3), dtype=np.float32)
    B = 3.0*np.ones((3, 4), dtype=np.float32)
    
    C = matrix_multiply_gpu(A, B)
    pprint(C)