Tags: python, c++, arrays, vector, cython

C++ function (vectors) wrapped with Cython is around 4 times slower than the equivalent Cython function (NumPy array memoryviews) with large arrays


I have two functions that implement the same algorithm: cpp_func1 in C++ (wrapped with Cython) and cython_func1 in Cython. The main difference between them is that the C++ function works with vectors while the Cython function works with a NumPy array through memoryviews. Calling both from Python with a NumPy array of 100,000 elements and comparing execution times, the calls to the wrapped cpp_func1 are around 4 times slower at best (please see the extra reference at the end).

I also started measuring the maximum memory usage during each execution, and if the preliminary results are correct, the calls to the wrapped cpp_func1 show a much higher memory usage (more than 3 times higher).

So far, the Cython cython_func1 implementation has given the best performance, and any improvement suggestion is welcome (no Cython parallelization tested yet). However, I'm probably missing important optimizations in the C++ cpp_func1 wrapper, and this question is mainly about how to make it perform better.

The purpose of the benchmark is to compare approximately equivalent loop-and-container algorithms on both sides, working with container references in order to avoid copies and stay efficient with large containers.

As an extra reference, in another very similar pair of equivalent functions (cpp_func2 and cython_func2), also based on loops and containers, the average execution time of the wrapped cpp_func2 has been up to 24 times slower than the Cython equivalent.

cython_funcs.pyx:

import numpy as np


def cython_func1(double[::1] arr):
    cdef int n, m
    cdef int window = 5
    cdef int length = arr.shape[0] - window + 1
    cdef double min_var = 0.0
    cdef double max_var = 0.0
    cdef double diff_var = 0.0
    arr_result = np.zeros(length, dtype=np.double)
    cdef double[::1] arr_result_view = arr_result
    
    for n in range(length):
        diff_var = 0.0
        
        for m in range(n, (n + window)):
            if (m == n):
                min_var = arr[m]
                max_var = arr[m]
            else:
                if (arr[m] < min_var):
                    min_var = arr[m]
                elif (arr[m] > max_var):
                    max_var = arr[m]
          
        diff_var = max_var - min_var
        arr_result_view[n] = diff_var
    
    return arr_result
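If you want a vectorized baseline to sanity-check results against, the same windowed max-minus-min can be expressed with NumPy's sliding_window_view (available in NumPy >= 1.20); the helper name below is just for illustration:

```python
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

def rolling_range(arr, window=5):
    # One row per window position; shape (len(arr) - window + 1, window)
    windows = sliding_window_view(arr, window)
    return windows.max(axis=1) - windows.min(axis=1)

arr = np.array([3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0])
print(rolling_range(arr))  # range (max - min) of each 5-element window
```

This allocates the result in one shot and keeps all the looping inside NumPy, so it is also a useful reference point for the timings below.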

cython_funcs_setup.py:

# Compilation command (CMD): 
# python cython_funcs_setup.py build_ext --inplace

from setuptools import setup
from Cython.Build import cythonize
import numpy

setup(
    ext_modules = cythonize("cython_funcs.pyx"),
    include_path = [numpy.get_include()]
)

cpp_funcs.cpp:

std::vector<double> cpp_func1(const std::vector<double> &vec) {
    int window = 5;
    int len = vec.size() - window + 1;
    double min_var = 0.0;
    double max_var = 0.0;
    double diff_var = 0.0;
    std::vector<double> vec_result(len, 0.0);

    for (int n = 0; n < len; n++) {
        diff_var = 0.0;

        for (int m = n; m < (n + window); m++) {
            if (m == n) {
                min_var = vec[m];
                max_var = vec[m];
            }
            else {
                if (vec[m] < min_var) {
                    min_var = vec[m];
                }
                else if (vec[m] > max_var) {
                    max_var = vec[m];
                }
            }
        }
        diff_var = max_var - min_var;
        vec_result[n] = diff_var;
    }
    return vec_result;
}

cpp_funcs.h:

#ifndef CPP_FUNCS_H
#define CPP_FUNCS_H

#include <vector>
#include <stdint.h>


std::vector<double> cpp_func1(const std::vector<double> &vec);

#endif

cpp_funcs_wrapper.pyx:

import numpy as np
from libcpp.vector cimport vector


#cdef extern from "cpp_funcs.cpp":
#    pass

cdef extern from "cpp_funcs.h":
    vector[double] cpp_func1(vector[double] &vec)


def cpp_func1_cython(vector[double] &vec):
    return cpp_func1(vec)

cpp_funcs_setup.py:

# Compilation command (CMD): 
# python cpp_funcs_setup.py build_ext --inplace

from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy

extensions = [Extension("cpp_funcs_wrapper",
                        ["cpp_funcs_wrapper.pyx", "cpp_funcs.cpp"],
                        language="c++",
                        #extra_link_args=["-lz"]
                    )]
setup(
    name="cpp_funcs_wrapper", 
    ext_modules=cythonize(extensions),
    include_path = [numpy.get_include()]
)

Calls from Python:

import cython_funcs
import cpp_funcs_wrapper
import numpy as np
import timeit

arr = np.random.uniform(1, 100000, 100_000)


var1 = timeit.timeit("cython_funcs.cython_func1(arr)", globals=globals(), number=100)
print(f"Average time = {((var1/100) * 1000)} ms.")


var2 = timeit.timeit("cpp_funcs_wrapper.cpp_func1_cython(arr)", globals=globals(), number=100)
print(f"Average time = {((var2/100) * 1000)} ms")
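As a methodology side note, timeit.repeat with the minimum of several independent runs tends to give more stable numbers than a single average, since the minimum is least distorted by background load. A minimal sketch with a stand-in workload:

```python
import timeit

def work():
    # Stand-in workload; replace with the actual function call being measured
    return sum(i * i for i in range(1000))

# 5 independent runs of 100 calls each; report the best run's per-call time
times = timeit.repeat(work, number=100, repeat=5)
print(f"Best average time = {min(times) / 100 * 1000:.4f} ms")
```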

UPDATE 1:
As mentioned in the comments below, I've tried to include compiler optimization flags in the Cython setup file, hoping to get the wrapped C++ function working even more efficiently than the pure Cython version. Sometimes the flag instructions were simply ignored, and other times there was a compilation error. When a flag was ignored, the terminal showed a message along the lines of "unrecognized option '/O2'; ignored"; the compilation still completed, but since the optimization was ignored, the benchmark results stayed in the same range, i.e. no improvement in the wrapped C++ function's execution time:

About the flags, I've tried more than 10 variants; here is one of them to give the general idea:

cpp_funcs_O_setup.py:

'''
This is not the current Cython setup file, i.e. the execution time
measurements were not done with files compiled from this code, and in
fact none of the following attempts ever got the optimization part
working. It is shown only to illustrate what has been tried.

Reference for the syntax, format, etc.:
https://cython.readthedocs.io/en/stable/src/tutorial/parallelization.html#compilation

All compilation optimization attempts were made only through the Cython
setup file, i.e. without modifying anything in the compilation command.

Compilation command (CMD): 
python cpp_funcs_O_setup.py build_ext --inplace
'''

from setuptools import Extension, setup
from Cython.Build import cythonize
import sys
#Also including the part about NumPy when needed.

if sys.platform.startswith("win"):
    compile_args = '/O2'
else:
    compile_args = '-O2'


ext_modules = [
    Extension(
        "cpp_funcs_wrapper",
        sources=["cpp_funcs_wrapper.pyx", "cpp_funcs.cpp"],
        language="c++",
        extra_compile_args=[compile_args],  # Also tried e.g. '/O1', '/O2', '/O3', '/Ofast', '-O1', '-O2', '-O3', ... directly here to see if any would work.
        extra_link_args=[compile_args],
        #extra_link_args=[compile_args],
    ),
    # Also trying to comment/omit the next `Extension` part, and other variants.
    Extension(
        "cpp_funcs_wrapper",
        sources=["cpp_funcs_wrapper.pyx", "cpp_funcs.cpp"],
        language="c++",
        extra_compile_args=[compile_args],
        extra_link_args=[compile_args],
        #extra_link_args=[compile_args],
    )
]

setup(
    name='cpp_funcs_wrapper',
    ext_modules=cythonize(ext_modules),
)

UPDATE 2:
The compilation command format is python cpp_funcs_setup.py build_ext --inplace, and here is the relevant information shown in the terminal during the compilation process. It shows that Visual Studio is invoked automatically when building with the files above (cython_funcs_setup.py and cpp_funcs_setup.py):

...>python cpp_funcs_setup.py build_ext --inplace
C:\...\Python\Python312\Lib\site-packages\setuptools\_distutils\dist.py:261: UserWarning: Unknown distribution option: 'include_path'
  warnings.warn(msg)
running build_ext
building 'cpp_funcs_wrapper' extension
"C:\...\... Visual Studio\2022\...\x64\cl.exe" /c /nologo /O2 /W3 /GL /DNDEBUG /MD -IC:\...\Python\Python312\include -IC:\...\Python\Python312\Include  ...  /EHsc /Tpcpp_funcs.cpp /Fobuild\temp.win-amd64-cpython-312\Release\cpp_funcs.obj
...
Generating code
Finished generating code
copying build\lib.win-amd64-cpython-312\cpp_funcs_wrapper.cp312-win_amd64.pyd ->
...

UPDATE 3:
About the compilation optimizations: as far as I can now see, the part being ignored during the compilation process is the content of extra_link_args=[...] (the linker does not recognize compiler options such as /O2), while the content of extra_compile_args=[...] is being recognized. The /O2 argument in the Cython setup file is accepted, but /O3 is ignored (MSVC has no /O3 option; /O2 is its highest general optimization level). I briefly tested variants such as extra_compile_args=['/O2', '/GL', '/fp:fast', '/Ot', '/Ob3'], but so far couldn't see any change in the total execution times, so that's the next step to cover.
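For what it's worth, a common pattern is to branch on the platform and keep compiler and linker arguments separate; the flags below are only examples, assuming MSVC on Windows and GCC/Clang elsewhere:

```python
import sys

# MSVC has no /O3 or /Ofast: /O2 is already its highest general
# optimization level, which is why /O3 gets ignored with a warning.
# Compiler flags like /O2 mean nothing to the linker, so they do not
# belong in extra_link_args.
if sys.platform.startswith("win"):          # MSVC
    compile_args = ["/O2", "/fp:fast"]
else:                                       # GCC/Clang
    compile_args = ["-O3", "-ffast-math"]

extra_link_args = []  # leave linker flags out unless actually needed
```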


Solution

  • The issue is that while your wrapper looks very simple, it is actually hiding a lot of (inefficient) work. If you write the wrapper manually instead, you can boost performance considerably.

    Your wrapper:

    def cpp_func1_cython(vector[double] &vec):
        return cpp_func1(vec)
    

    is roughly equivalent to:

    def cpp_func1_cython(object input_data):
        cdef vector[double] vec
    
        # Convert input_data to vector[double]
        # Aside from the `push_back()` this pretty much all happens inside the 
        # python interpreter
        for item in input_data:
            vec.push_back(float(item))
    
        cdef vector[double] result = cpp_func1(vec)
    
        # convert vector[double] to list[object]
        # this mostly happens in the python interpreter too
        list_ = []
        for item in result:
            list_.append(float(item))
        # convert list[object] to ndarray[np.float64]
        # mostly happens in python interpreter too
        arr = np.array(list_)
    
        return arr
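The per-element cost of that implicit round trip can be illustrated from plain Python (a hypothetical micro-demo, not part of the wrapper):

```python
import numpy as np

arr = np.random.uniform(1, 100, 1000)

# What the implicit coercion effectively does: every element becomes a
# Python float object on the way in, and again on the way out.
round_trip = np.array([float(x) for x in arr])

# A contiguous block copy touches no per-element Python objects.
block_copy = arr.copy()

assert np.array_equal(round_trip, block_copy)
```

Both produce identical data, but the first path pays Python interpreter overhead per element, which is exactly where the 4x slowdown comes from.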
    

    Performant Code

    The wrapper I wrote below is marginally faster than your pure Cython implementation on my machine.

    cpp_funcs_wrapper.pyx:

    import numpy as np
    import ctypes
    
    cimport numpy as cnp
    
    from libcpp.vector cimport vector
    from libc.stdint cimport uintptr_t
    
    cnp.import_array()
    
    cdef extern from "cpp_funcs.h":
        vector[double] cpp_func1(vector[double] &vec)
    
    # Implicitly converts an ndarray to vector[double]
    # This is expensive, as the array is first converted to list[np.float64]
    # as an intermediate
    # The reverse is also done when returning the data
    # vector[double] -> list[float] -> ndarray[np.float64]
    def cpp_func1_cython(vector[double] &vec):
        return cpp_func1(vec)
    
    # `data` instead of `vector`, which would shadow the cimported vector type
    def cpp_func_dispatcher(data):
        if isinstance(data, np.ndarray) and data.dtype == np.float64:
            return dispatch_numpy(data)
        else:  # fallback to default implementation (or maybe raise TypeError)
            return cpp_func1_cython(data)
    
    # Implicitly converts an ndarray to double[:]
    # This is cheap as double[::1] is just a view over the ndarray
    # We *MUST* insist on `[::1]`, as it enforces that the memory is contiguous.
    # It is important that the memory is contiguous, because we use pointer
    # arithmetic to copy the data in `arr` to `vec`.
    cdef dispatch_numpy(double[::1] arr):
        cdef vector[double] vec
        # copy view into vector
        # this is efficient as it is all done in C++, and is likely specialised to
        # copy the data as a single block
        vec.insert(vec.end(), &arr[0], (&arr[0])+len(arr))
        cdef vector[double] result = cpp_func1(vec)
        # copy vector back into an ndarray
        view = (ctypes.c_double * result.size()).from_address(<uintptr_t> &result[0])
        # this copy is efficient as numpy can just copy the data as a single block
        # NB. You *MUST* use copy() here, as the array returned by np.asarray()
        # is just a view over the memory owned by `vec`. The vector will drop
        # out of scope when this function returns and free its memory.
        return np.asarray(view).copy()
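The from_address trick used above can be tried out from pure Python by pointing it at a NumPy array's own buffer (illustration only; in the wrapper, the address comes from the C++ vector instead):

```python
import ctypes
import numpy as np

src = np.array([1.0, 2.0, 3.0])

# Build a ctypes view over the raw buffer, as dispatch_numpy does with &result[0]
view = (ctypes.c_double * src.size).from_address(src.ctypes.data)

# np.asarray(view) is only a view over that memory; .copy() detaches it,
# which is essential when the buffer's owner is about to go out of scope.
out = np.asarray(view).copy()
```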
    

    As this code needs to use NumPy, it also needs access to the NumPy header files. You will need to modify the Extension in your setup.py to say where to find those headers, e.g.:

    extensions = [Extension(
        "cpp_funcs_wrapper",
        ["cpp_funcs_wrapper.pyx", "cpp_funcs.cpp"],
        language="c++",
        include_dirs=[numpy.get_include()],
    )]