Tags: cython, cpython, cythonize

How to make Cython faster?


This is a portion of my code. I tried it in both Python and Cython. In this case Cython is about 2 s faster, but only when the return type is declared; otherwise it is almost 3.5 s slower than the Python code. Is there any way to make it faster? Any help or discussion would be appreciated. Thank you.

%%cython

# %%cython --compile-args=-fopenmp --link-args=-fopenmp --force

cimport cython
cimport numpy as cnp
import numpy as np
from cython.parallel import parallel, prange

ctypedef cnp.int_t DTYPE

@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
@cython.nogil
@cython.cfunc
@cython.exceptval(-1)
@cython.returns(list)
cdef list sub_mat_extract(cnp.ndarray[DTYPE, ndim=3] mat, cython.int neibors):
    cdef:
        Py_ssize_t M = 0, N = 0, x = 0
        Py_ssize_t i
        Py_ssize_t j
        Py_ssize_t row = np.shape(mat)[0]
        Py_ssize_t col = np.shape(mat)[1]

        list temp = []
        list temp1 = []
        list dup1 = []
        list dup2 = []

    for i in range((neibors - 1) / 2, row - ((neibors - 1) / 2)):
        N = 0
        temp1 = []
        for j in range(col):
            temp1.extend(mat[j + M][0 + N : neibors + N])
            if j + M == neibors + M - 1:
                M = M + 1
                break
        temp.append(temp1)
        N += 1
        if M == col:
            break

    dup1 = []

    for i in range(len(temp)):
        x = 0
        while x <= col - neibors:
            dup2 = []
            for j in range(len(temp[i])):
                dup2.append([temp[i][j][0], temp[i][j][1] + x])
            dup1.append(dup2)
            x = x + 1

    return dup1

def action(mat, neibor):
    return sub_mat_extract(np.array(mat), neibor)


The time for the Python version:

CPU times: total: 5.23 s
Wall time: 5.77 s

The same for Cython:

CPU times: total: 3.14 s
Wall time: 4.78 s

I am trying to convert all my code from conventional Python to Cython. I want to see whether Cython can be faster than Python in all cases. My ultimate goal is to understand how fast a piece of code can run (utilizing the hardware via tools such as Numba and multiprocessing, and Python-like compilers). I am running the code in a Jupyter notebook only.
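For reference, this is roughly how I time the Cython version in the notebook (a minimal sketch; the input shape and dtype are placeholders, not my real data, and `action` is the wrapper defined in the %%cython cell above):

import time
import numpy as np

# Placeholder input: the shape is only an example, and the dtype has to match
# the cnp.int_t buffer type in the cdef signature; np.int_ is an assumption here.
M = np.ones((1000, 250, 250), dtype=np.int_)

t0 = time.perf_counter()
res = action(M, 25)               # `action` is defined in the %%cython cell above
t1 = time.perf_counter()
print(f"Cython version: {t1 - t0:.2f} s")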


Solution

  • Here are some things in arbitrary order that crossed my mind while looking at your code snippet:

    Timing your code snippet on my machine yields:

    In [15]: M = np.ones((1000, 250, 250), dtype=np.int32)
    In [16]: %timeit res1 = action(M, 25)
    3.8 s ± 19.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    

    Here's an improved version of your code that needs to be compiled in C++ mode. The main advantage is that it uses a C++ STL vector and memoryviews to copy the specific rows of your mat array instead of Python lists. In addition, it returns a numpy array instead of nested lists.

    %%cython -f -a -c=-O3
    
    #distutils: language = c++
    
    from cython cimport wraparound, boundscheck, cdivision
    cimport numpy as np
    import numpy as np
    from libcpp.vector cimport vector
    
    @boundscheck(False)
    @wraparound(False)
    cdef void copy_elements(vector[int]& out, int[:, ::1] mv):
        # append every element of the 2-D memoryview to the flat vector
        cdef int n_rows = mv.shape[0]
        cdef int n_cols = mv.shape[1]
        cdef int i, j
        for i in range(n_rows):
            for j in range(n_cols):
                out.push_back(mv[i, j])
    
    @boundscheck(False)
    @cdivision(True)
    @wraparound(False)
    def sub_mat_extract(int[:, :, ::1] mat, int neibors):
        cdef int M = 0
        cdef int x = 0
        cdef int i, j, k
        cdef int r = 0
        cdef int count = 0
        cdef int n_rows = mat.shape[0]
        cdef int n_cols = mat.shape[1]
        cdef int n_tmps = n_rows - neibors + 2
        cdef int neibors_pow2 = neibors*neibors
        cdef vector[int] temp
        cdef int[::1] out_mv
    
        # per window: copy the first `neibors` rows of `neibors` consecutive planes
        # into the flat vector, then advance the plane offset M
        for i in range(n_tmps):
            count += 1
            for j in range(n_cols):
                copy_elements(temp, mat[j+M, :neibors])
                if j == neibors - 1:
                    M += 1
                    break
            if M == n_cols:
                break
        # allocate memory
        out_mv = np.zeros(count*neibors_pow2*(n_cols-neibors+1)*2, dtype=np.int32)
        
        for i in range(count):
            for x in range(n_cols - neibors + 1):
                for j in range(neibors*neibors):
                    out_mv[r] = temp[i + j*n_tmps + 0]
                    r += 1
                    out_mv[r] = temp[i + j*n_tmps + 1] + x
                    r += 1
        return np.asarray(out_mv).reshape(-1, neibors_pow2, 2)
    

    Timing again on my machine (with the same input as above) shows that this approach is nearly 20 times faster:

    In [18]: %timeit res2 = sub_mat_extract(M, 25)
    198 ms ± 3.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
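
    As a minimal usage sketch (assuming the %%cython cell above has already been executed), you can inspect the returned array directly; each block is a (neibors*neibors, 2) slice of the flat buffer:

    import numpy as np

    M = np.ones((1000, 250, 250), dtype=np.int32)  # int32 to match the int[:, :, ::1] signature

    res2 = sub_mat_extract(M, 25)
    print(res2.shape)     # (n_blocks, 25*25, 2)
    print(res2[0, :3])    # the first few pairs of the first block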