python-3.xoptimizationcompilationnumbasorting-network

Numba compilation time exponentially exploding - can optimization level be configured like a c compiler?


This code implements sorting networks, and I'm trying to compile them with Numba for performance improvements. However, the compilation time for each function is growing exponentially. With around 60 functions in total (only 19 examples shown below), Numba will not finish compiling them before the sun undergoes red giant expansion.

I suspect the issue lies in Numba attempting to apply aggressive optimization flags like -O2 during compilation, leading to excessive complexity and processing times.

EDIT: I found that numba takes his optimization level from environment variable NUMBA_OPT, so I set it to 0 with

import os
os.environ["NUMBA_OPT"] = "0"

but it does nothing.

Is there a way to instruct Numba to simply generate the assembly code for these functions without attempting further optimizations? Or any other way to get it compiled?

import numba as nb
import numpy as np

# This function calculates the min and the max of his parameters.
@nb.njit(nb.types.UniTuple(nb.uint64, 2)(nb.uint64, nb.uint64),fastmath=True,inline='always')
def m(a: np.uint64, b: np.uint64) -> (np.uint64, np.uint64):
    return min(a, b), max(a, b)

@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_1(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    return a

print('Defining function 2')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_2(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[1] = m(a[0], a[1])
    return a

print('Defining function 3')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_3(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[2] = m(a[0], a[2])
    a[0], a[1] = m(a[0], a[1])
    a[1], a[2] = m(a[1], a[2])
    return a

print('Defining function 4')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_4(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[2] = m(a[0], a[2])
    a[1], a[3] = m(a[1], a[3])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[3] = m(a[2], a[3])
    a[1], a[2] = m(a[1], a[2])
    return a

print('Defining function 5')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_5(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[3] = m(a[0], a[3])
    a[1], a[4] = m(a[1], a[4])
    a[0], a[2] = m(a[0], a[2])
    a[1], a[3] = m(a[1], a[3])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[4] = m(a[2], a[4])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[4] = m(a[3], a[4])
    a[2], a[3] = m(a[2], a[3])
    return a

print('Defining function 6')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_6(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[5] = m(a[0], a[5])
    a[1], a[3] = m(a[1], a[3])
    a[2], a[4] = m(a[2], a[4])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[4] = m(a[3], a[4])
    a[0], a[3] = m(a[0], a[3])
    a[2], a[5] = m(a[2], a[5])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[4] = m(a[3], a[4])
    return a

print('Defining function 7')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_7(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[6] = m(a[0], a[6])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[0], a[2] = m(a[0], a[2])
    a[1], a[4] = m(a[1], a[4])
    a[3], a[6] = m(a[3], a[6])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[5] = m(a[2], a[5])
    a[3], a[4] = m(a[3], a[4])
    a[1], a[2] = m(a[1], a[2])
    a[4], a[6] = m(a[4], a[6])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    return a

print('Defining function 8')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_8(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[2] = m(a[0], a[2])
    a[1], a[3] = m(a[1], a[3])
    a[4], a[6] = m(a[4], a[6])
    a[5], a[7] = m(a[5], a[7])
    a[0], a[4] = m(a[0], a[4])
    a[1], a[5] = m(a[1], a[5])
    a[2], a[6] = m(a[2], a[6])
    a[3], a[7] = m(a[3], a[7])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    a[2], a[4] = m(a[2], a[4])
    a[3], a[5] = m(a[3], a[5])
    a[1], a[4] = m(a[1], a[4])
    a[3], a[6] = m(a[3], a[6])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    return a

print('Defining function 9')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_9(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[3] = m(a[0], a[3])
    a[1], a[7] = m(a[1], a[7])
    a[2], a[5] = m(a[2], a[5])
    a[4], a[8] = m(a[4], a[8])
    a[0], a[7] = m(a[0], a[7])
    a[2], a[4] = m(a[2], a[4])
    a[3], a[8] = m(a[3], a[8])
    a[5], a[6] = m(a[5], a[6])
    a[0], a[2] = m(a[0], a[2])
    a[1], a[3] = m(a[1], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[7], a[8] = m(a[7], a[8])
    a[1], a[4] = m(a[1], a[4])
    a[3], a[6] = m(a[3], a[6])
    a[5], a[7] = m(a[5], a[7])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[4] = m(a[2], a[4])
    a[3], a[5] = m(a[3], a[5])
    a[6], a[8] = m(a[6], a[8])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    return a

print('Defining function 10')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_10(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[8] = m(a[0], a[8])
    a[1], a[9] = m(a[1], a[9])
    a[2], a[7] = m(a[2], a[7])
    a[3], a[5] = m(a[3], a[5])
    a[4], a[6] = m(a[4], a[6])
    a[0], a[2] = m(a[0], a[2])
    a[1], a[4] = m(a[1], a[4])
    a[5], a[8] = m(a[5], a[8])
    a[7], a[9] = m(a[7], a[9])
    a[0], a[3] = m(a[0], a[3])
    a[2], a[4] = m(a[2], a[4])
    a[5], a[7] = m(a[5], a[7])
    a[6], a[9] = m(a[6], a[9])
    a[0], a[1] = m(a[0], a[1])
    a[3], a[6] = m(a[3], a[6])
    a[8], a[9] = m(a[8], a[9])
    a[1], a[5] = m(a[1], a[5])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[8] = m(a[4], a[8])
    a[6], a[7] = m(a[6], a[7])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[5] = m(a[3], a[5])
    a[4], a[6] = m(a[4], a[6])
    a[7], a[8] = m(a[7], a[8])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    return a

print('Defining function 11')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_11(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[9] = m(a[0], a[9])
    a[1], a[6] = m(a[1], a[6])
    a[2], a[4] = m(a[2], a[4])
    a[3], a[7] = m(a[3], a[7])
    a[5], a[8] = m(a[5], a[8])
    a[0], a[1] = m(a[0], a[1])
    a[3], a[5] = m(a[3], a[5])
    a[4], a[10] = m(a[4], a[10])
    a[6], a[9] = m(a[6], a[9])
    a[7], a[8] = m(a[7], a[8])
    a[1], a[3] = m(a[1], a[3])
    a[2], a[5] = m(a[2], a[5])
    a[4], a[7] = m(a[4], a[7])
    a[8], a[10] = m(a[8], a[10])
    a[0], a[4] = m(a[0], a[4])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[7] = m(a[3], a[7])
    a[5], a[9] = m(a[5], a[9])
    a[6], a[8] = m(a[6], a[8])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[6] = m(a[2], a[6])
    a[4], a[5] = m(a[4], a[5])
    a[7], a[8] = m(a[7], a[8])
    a[9], a[10] = m(a[9], a[10])
    a[2], a[4] = m(a[2], a[4])
    a[3], a[6] = m(a[3], a[6])
    a[5], a[7] = m(a[5], a[7])
    a[8], a[9] = m(a[8], a[9])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[8] = m(a[7], a[8])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    return a

print('Defining function 12')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_12(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[8] = m(a[0], a[8])
    a[1], a[7] = m(a[1], a[7])
    a[2], a[6] = m(a[2], a[6])
    a[3], a[11] = m(a[3], a[11])
    a[4], a[10] = m(a[4], a[10])
    a[5], a[9] = m(a[5], a[9])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[5] = m(a[2], a[5])
    a[3], a[4] = m(a[3], a[4])
    a[6], a[9] = m(a[6], a[9])
    a[7], a[8] = m(a[7], a[8])
    a[10], a[11] = m(a[10], a[11])
    a[0], a[2] = m(a[0], a[2])
    a[1], a[6] = m(a[1], a[6])
    a[5], a[10] = m(a[5], a[10])
    a[9], a[11] = m(a[9], a[11])
    a[0], a[3] = m(a[0], a[3])
    a[1], a[2] = m(a[1], a[2])
    a[4], a[6] = m(a[4], a[6])
    a[5], a[7] = m(a[5], a[7])
    a[8], a[11] = m(a[8], a[11])
    a[9], a[10] = m(a[9], a[10])
    a[1], a[4] = m(a[1], a[4])
    a[3], a[5] = m(a[3], a[5])
    a[6], a[8] = m(a[6], a[8])
    a[7], a[10] = m(a[7], a[10])
    a[1], a[3] = m(a[1], a[3])
    a[2], a[5] = m(a[2], a[5])
    a[6], a[9] = m(a[6], a[9])
    a[8], a[10] = m(a[8], a[10])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    a[8], a[9] = m(a[8], a[9])
    a[4], a[6] = m(a[4], a[6])
    a[5], a[7] = m(a[5], a[7])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[8] = m(a[7], a[8])
    return a

print('Defining function 13')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_13(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[12] = m(a[0], a[12])
    a[1], a[10] = m(a[1], a[10])
    a[2], a[9] = m(a[2], a[9])
    a[3], a[7] = m(a[3], a[7])
    a[5], a[11] = m(a[5], a[11])
    a[6], a[8] = m(a[6], a[8])
    a[1], a[6] = m(a[1], a[6])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[11] = m(a[4], a[11])
    a[7], a[9] = m(a[7], a[9])
    a[8], a[10] = m(a[8], a[10])
    a[0], a[4] = m(a[0], a[4])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[6] = m(a[3], a[6])
    a[7], a[8] = m(a[7], a[8])
    a[9], a[10] = m(a[9], a[10])
    a[11], a[12] = m(a[11], a[12])
    a[4], a[6] = m(a[4], a[6])
    a[5], a[9] = m(a[5], a[9])
    a[8], a[11] = m(a[8], a[11])
    a[10], a[12] = m(a[10], a[12])
    a[0], a[5] = m(a[0], a[5])
    a[3], a[8] = m(a[3], a[8])
    a[4], a[7] = m(a[4], a[7])
    a[6], a[11] = m(a[6], a[11])
    a[9], a[10] = m(a[9], a[10])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[5] = m(a[2], a[5])
    a[6], a[9] = m(a[6], a[9])
    a[7], a[8] = m(a[7], a[8])
    a[10], a[11] = m(a[10], a[11])
    a[1], a[3] = m(a[1], a[3])
    a[2], a[4] = m(a[2], a[4])
    a[5], a[6] = m(a[5], a[6])
    a[9], a[10] = m(a[9], a[10])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[7] = m(a[5], a[7])
    a[6], a[8] = m(a[6], a[8])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    a[8], a[9] = m(a[8], a[9])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    return a

print('Defining function 14')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_14(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[1] = m(a[0], a[1])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    a[8], a[9] = m(a[8], a[9])
    a[10], a[11] = m(a[10], a[11])
    a[12], a[13] = m(a[12], a[13])
    a[0], a[2] = m(a[0], a[2])
    a[1], a[3] = m(a[1], a[3])
    a[4], a[8] = m(a[4], a[8])
    a[5], a[9] = m(a[5], a[9])
    a[10], a[12] = m(a[10], a[12])
    a[11], a[13] = m(a[11], a[13])
    a[0], a[4] = m(a[0], a[4])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[7] = m(a[3], a[7])
    a[5], a[8] = m(a[5], a[8])
    a[6], a[10] = m(a[6], a[10])
    a[9], a[13] = m(a[9], a[13])
    a[11], a[12] = m(a[11], a[12])
    a[0], a[6] = m(a[0], a[6])
    a[1], a[5] = m(a[1], a[5])
    a[3], a[9] = m(a[3], a[9])
    a[4], a[10] = m(a[4], a[10])
    a[7], a[13] = m(a[7], a[13])
    a[8], a[12] = m(a[8], a[12])
    a[2], a[10] = m(a[2], a[10])
    a[3], a[11] = m(a[3], a[11])
    a[4], a[6] = m(a[4], a[6])
    a[7], a[9] = m(a[7], a[9])
    a[1], a[3] = m(a[1], a[3])
    a[2], a[8] = m(a[2], a[8])
    a[5], a[11] = m(a[5], a[11])
    a[6], a[7] = m(a[6], a[7])
    a[10], a[12] = m(a[10], a[12])
    a[1], a[4] = m(a[1], a[4])
    a[2], a[6] = m(a[2], a[6])
    a[3], a[5] = m(a[3], a[5])
    a[7], a[11] = m(a[7], a[11])
    a[8], a[10] = m(a[8], a[10])
    a[9], a[12] = m(a[9], a[12])
    a[2], a[4] = m(a[2], a[4])
    a[3], a[6] = m(a[3], a[6])
    a[5], a[8] = m(a[5], a[8])
    a[7], a[10] = m(a[7], a[10])
    a[9], a[11] = m(a[9], a[11])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[8] = m(a[7], a[8])
    a[9], a[10] = m(a[9], a[10])
    a[6], a[7] = m(a[6], a[7])
    return a

print('Defining function 15')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_15(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[1], a[2] = m(a[1], a[2])
    a[3], a[10] = m(a[3], a[10])
    a[4], a[14] = m(a[4], a[14])
    a[5], a[8] = m(a[5], a[8])
    a[6], a[13] = m(a[6], a[13])
    a[7], a[12] = m(a[7], a[12])
    a[9], a[11] = m(a[9], a[11])
    a[0], a[14] = m(a[0], a[14])
    a[1], a[5] = m(a[1], a[5])
    a[2], a[8] = m(a[2], a[8])
    a[3], a[7] = m(a[3], a[7])
    a[6], a[9] = m(a[6], a[9])
    a[10], a[12] = m(a[10], a[12])
    a[11], a[13] = m(a[11], a[13])
    a[0], a[7] = m(a[0], a[7])
    a[1], a[6] = m(a[1], a[6])
    a[2], a[9] = m(a[2], a[9])
    a[4], a[10] = m(a[4], a[10])
    a[5], a[11] = m(a[5], a[11])
    a[8], a[13] = m(a[8], a[13])
    a[12], a[14] = m(a[12], a[14])
    a[0], a[6] = m(a[0], a[6])
    a[2], a[4] = m(a[2], a[4])
    a[3], a[5] = m(a[3], a[5])
    a[7], a[11] = m(a[7], a[11])
    a[8], a[10] = m(a[8], a[10])
    a[9], a[12] = m(a[9], a[12])
    a[13], a[14] = m(a[13], a[14])
    a[0], a[3] = m(a[0], a[3])
    a[1], a[2] = m(a[1], a[2])
    a[4], a[7] = m(a[4], a[7])
    a[5], a[9] = m(a[5], a[9])
    a[6], a[8] = m(a[6], a[8])
    a[10], a[11] = m(a[10], a[11])
    a[12], a[13] = m(a[12], a[13])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[6] = m(a[4], a[6])
    a[7], a[9] = m(a[7], a[9])
    a[10], a[12] = m(a[10], a[12])
    a[11], a[13] = m(a[11], a[13])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[5] = m(a[3], a[5])
    a[8], a[10] = m(a[8], a[10])
    a[11], a[12] = m(a[11], a[12])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[8] = m(a[7], a[8])
    a[9], a[10] = m(a[9], a[10])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    a[8], a[9] = m(a[8], a[9])
    a[10], a[11] = m(a[10], a[11])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[8] = m(a[7], a[8])
    return a

print('Defining function 16')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_16(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[13] = m(a[0], a[13])
    a[1], a[12] = m(a[1], a[12])
    a[2], a[15] = m(a[2], a[15])
    a[3], a[14] = m(a[3], a[14])
    a[4], a[8] = m(a[4], a[8])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[11] = m(a[7], a[11])
    a[9], a[10] = m(a[9], a[10])
    a[0], a[5] = m(a[0], a[5])
    a[1], a[7] = m(a[1], a[7])
    a[2], a[9] = m(a[2], a[9])
    a[3], a[4] = m(a[3], a[4])
    a[6], a[13] = m(a[6], a[13])
    a[8], a[14] = m(a[8], a[14])
    a[10], a[15] = m(a[10], a[15])
    a[11], a[12] = m(a[11], a[12])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[8] = m(a[6], a[8])
    a[7], a[9] = m(a[7], a[9])
    a[10], a[11] = m(a[10], a[11])
    a[12], a[13] = m(a[12], a[13])
    a[14], a[15] = m(a[14], a[15])
    a[0], a[2] = m(a[0], a[2])
    a[1], a[3] = m(a[1], a[3])
    a[4], a[10] = m(a[4], a[10])
    a[5], a[11] = m(a[5], a[11])
    a[6], a[7] = m(a[6], a[7])
    a[8], a[9] = m(a[8], a[9])
    a[12], a[14] = m(a[12], a[14])
    a[13], a[15] = m(a[13], a[15])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[12] = m(a[3], a[12])
    a[4], a[6] = m(a[4], a[6])
    a[5], a[7] = m(a[5], a[7])
    a[8], a[10] = m(a[8], a[10])
    a[9], a[11] = m(a[9], a[11])
    a[13], a[14] = m(a[13], a[14])
    a[1], a[4] = m(a[1], a[4])
    a[2], a[6] = m(a[2], a[6])
    a[5], a[8] = m(a[5], a[8])
    a[7], a[10] = m(a[7], a[10])
    a[9], a[13] = m(a[9], a[13])
    a[11], a[14] = m(a[11], a[14])
    a[2], a[4] = m(a[2], a[4])
    a[3], a[6] = m(a[3], a[6])
    a[9], a[12] = m(a[9], a[12])
    a[11], a[13] = m(a[11], a[13])
    a[3], a[5] = m(a[3], a[5])
    a[6], a[8] = m(a[6], a[8])
    a[7], a[9] = m(a[7], a[9])
    a[10], a[12] = m(a[10], a[12])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[8] = m(a[7], a[8])
    a[9], a[10] = m(a[9], a[10])
    a[11], a[12] = m(a[11], a[12])
    a[6], a[7] = m(a[6], a[7])
    a[8], a[9] = m(a[8], a[9])
    return a

print('Defining function 17')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_17(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[11] = m(a[0], a[11])
    a[1], a[15] = m(a[1], a[15])
    a[2], a[10] = m(a[2], a[10])
    a[3], a[5] = m(a[3], a[5])
    a[4], a[6] = m(a[4], a[6])
    a[8], a[12] = m(a[8], a[12])
    a[9], a[16] = m(a[9], a[16])
    a[13], a[14] = m(a[13], a[14])
    a[0], a[6] = m(a[0], a[6])
    a[1], a[13] = m(a[1], a[13])
    a[2], a[8] = m(a[2], a[8])
    a[4], a[14] = m(a[4], a[14])
    a[5], a[15] = m(a[5], a[15])
    a[7], a[11] = m(a[7], a[11])
    a[0], a[8] = m(a[0], a[8])
    a[3], a[7] = m(a[3], a[7])
    a[4], a[9] = m(a[4], a[9])
    a[6], a[16] = m(a[6], a[16])
    a[10], a[11] = m(a[10], a[11])
    a[12], a[14] = m(a[12], a[14])
    a[0], a[2] = m(a[0], a[2])
    a[1], a[4] = m(a[1], a[4])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[13] = m(a[7], a[13])
    a[8], a[9] = m(a[8], a[9])
    a[10], a[12] = m(a[10], a[12])
    a[11], a[14] = m(a[11], a[14])
    a[15], a[16] = m(a[15], a[16])
    a[0], a[3] = m(a[0], a[3])
    a[2], a[5] = m(a[2], a[5])
    a[6], a[11] = m(a[6], a[11])
    a[7], a[10] = m(a[7], a[10])
    a[9], a[13] = m(a[9], a[13])
    a[12], a[15] = m(a[12], a[15])
    a[14], a[16] = m(a[14], a[16])
    a[0], a[1] = m(a[0], a[1])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[10] = m(a[5], a[10])
    a[6], a[9] = m(a[6], a[9])
    a[7], a[8] = m(a[7], a[8])
    a[11], a[15] = m(a[11], a[15])
    a[13], a[14] = m(a[13], a[14])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[7] = m(a[3], a[7])
    a[4], a[8] = m(a[4], a[8])
    a[6], a[12] = m(a[6], a[12])
    a[11], a[13] = m(a[11], a[13])
    a[14], a[15] = m(a[14], a[15])
    a[1], a[3] = m(a[1], a[3])
    a[2], a[7] = m(a[2], a[7])
    a[4], a[5] = m(a[4], a[5])
    a[9], a[11] = m(a[9], a[11])
    a[10], a[12] = m(a[10], a[12])
    a[13], a[14] = m(a[13], a[14])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[6] = m(a[4], a[6])
    a[5], a[7] = m(a[5], a[7])
    a[8], a[10] = m(a[8], a[10])
    a[3], a[4] = m(a[3], a[4])
    a[6], a[8] = m(a[6], a[8])
    a[7], a[9] = m(a[7], a[9])
    a[10], a[12] = m(a[10], a[12])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[8] = m(a[7], a[8])
    a[9], a[10] = m(a[9], a[10])
    a[11], a[12] = m(a[11], a[12])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    a[8], a[9] = m(a[8], a[9])
    a[10], a[11] = m(a[10], a[11])
    a[12], a[13] = m(a[12], a[13])
    return a

print('Defining function 18')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_18(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[1] = m(a[0], a[1])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    a[8], a[9] = m(a[8], a[9])
    a[10], a[11] = m(a[10], a[11])
    a[12], a[13] = m(a[12], a[13])
    a[14], a[15] = m(a[14], a[15])
    a[16], a[17] = m(a[16], a[17])
    a[0], a[2] = m(a[0], a[2])
    a[1], a[3] = m(a[1], a[3])
    a[4], a[12] = m(a[4], a[12])
    a[5], a[13] = m(a[5], a[13])
    a[6], a[8] = m(a[6], a[8])
    a[9], a[11] = m(a[9], a[11])
    a[14], a[16] = m(a[14], a[16])
    a[15], a[17] = m(a[15], a[17])
    a[0], a[14] = m(a[0], a[14])
    a[1], a[16] = m(a[1], a[16])
    a[2], a[15] = m(a[2], a[15])
    a[3], a[17] = m(a[3], a[17])
    a[0], a[6] = m(a[0], a[6])
    a[1], a[10] = m(a[1], a[10])
    a[2], a[9] = m(a[2], a[9])
    a[7], a[16] = m(a[7], a[16])
    a[8], a[15] = m(a[8], a[15])
    a[11], a[17] = m(a[11], a[17])
    a[1], a[4] = m(a[1], a[4])
    a[3], a[9] = m(a[3], a[9])
    a[5], a[7] = m(a[5], a[7])
    a[8], a[14] = m(a[8], a[14])
    a[10], a[12] = m(a[10], a[12])
    a[13], a[16] = m(a[13], a[16])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[5] = m(a[2], a[5])
    a[3], a[13] = m(a[3], a[13])
    a[4], a[14] = m(a[4], a[14])
    a[7], a[9] = m(a[7], a[9])
    a[8], a[10] = m(a[8], a[10])
    a[12], a[15] = m(a[12], a[15])
    a[16], a[17] = m(a[16], a[17])
    a[1], a[2] = m(a[1], a[2])
    a[3], a[5] = m(a[3], a[5])
    a[4], a[6] = m(a[4], a[6])
    a[11], a[13] = m(a[11], a[13])
    a[12], a[14] = m(a[12], a[14])
    a[15], a[16] = m(a[15], a[16])
    a[4], a[8] = m(a[4], a[8])
    a[5], a[12] = m(a[5], a[12])
    a[6], a[10] = m(a[6], a[10])
    a[7], a[11] = m(a[7], a[11])
    a[9], a[13] = m(a[9], a[13])
    a[1], a[4] = m(a[1], a[4])
    a[2], a[8] = m(a[2], a[8])
    a[3], a[6] = m(a[3], a[6])
    a[5], a[7] = m(a[5], a[7])
    a[9], a[15] = m(a[9], a[15])
    a[10], a[12] = m(a[10], a[12])
    a[11], a[14] = m(a[11], a[14])
    a[13], a[16] = m(a[13], a[16])
    a[2], a[4] = m(a[2], a[4])
    a[5], a[8] = m(a[5], a[8])
    a[6], a[10] = m(a[6], a[10])
    a[7], a[11] = m(a[7], a[11])
    a[9], a[12] = m(a[9], a[12])
    a[13], a[15] = m(a[13], a[15])
    a[3], a[5] = m(a[3], a[5])
    a[6], a[8] = m(a[6], a[8])
    a[7], a[10] = m(a[7], a[10])
    a[9], a[11] = m(a[9], a[11])
    a[12], a[14] = m(a[12], a[14])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[8] = m(a[7], a[8])
    a[9], a[10] = m(a[9], a[10])
    a[11], a[12] = m(a[11], a[12])
    a[13], a[14] = m(a[13], a[14])
    return a

print('Defining function 19')
@nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
def sort_small_array_19(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
    a[0], a[12] = m(a[0], a[12])
    a[1], a[4] = m(a[1], a[4])
    a[2], a[8] = m(a[2], a[8])
    a[3], a[5] = m(a[3], a[5])
    a[6], a[17] = m(a[6], a[17])
    a[7], a[11] = m(a[7], a[11])
    a[9], a[14] = m(a[9], a[14])
    a[10], a[13] = m(a[10], a[13])
    a[15], a[16] = m(a[15], a[16])
    a[0], a[2] = m(a[0], a[2])
    a[1], a[7] = m(a[1], a[7])
    a[3], a[6] = m(a[3], a[6])
    a[4], a[11] = m(a[4], a[11])
    a[5], a[17] = m(a[5], a[17])
    a[8], a[12] = m(a[8], a[12])
    a[10], a[15] = m(a[10], a[15])
    a[13], a[16] = m(a[13], a[16])
    a[14], a[18] = m(a[14], a[18])
    a[3], a[10] = m(a[3], a[10])
    a[4], a[14] = m(a[4], a[14])
    a[5], a[15] = m(a[5], a[15])
    a[6], a[13] = m(a[6], a[13])
    a[7], a[9] = m(a[7], a[9])
    a[11], a[17] = m(a[11], a[17])
    a[16], a[18] = m(a[16], a[18])
    a[0], a[7] = m(a[0], a[7])
    a[1], a[10] = m(a[1], a[10])
    a[4], a[6] = m(a[4], a[6])
    a[9], a[15] = m(a[9], a[15])
    a[11], a[16] = m(a[11], a[16])
    a[12], a[17] = m(a[12], a[17])
    a[13], a[14] = m(a[13], a[14])
    a[0], a[3] = m(a[0], a[3])
    a[2], a[6] = m(a[2], a[6])
    a[5], a[7] = m(a[5], a[7])
    a[8], a[11] = m(a[8], a[11])
    a[12], a[16] = m(a[12], a[16])
    a[1], a[8] = m(a[1], a[8])
    a[2], a[9] = m(a[2], a[9])
    a[3], a[4] = m(a[3], a[4])
    a[6], a[15] = m(a[6], a[15])
    a[7], a[13] = m(a[7], a[13])
    a[10], a[11] = m(a[10], a[11])
    a[12], a[18] = m(a[12], a[18])
    a[1], a[3] = m(a[1], a[3])
    a[2], a[5] = m(a[2], a[5])
    a[6], a[9] = m(a[6], a[9])
    a[7], a[12] = m(a[7], a[12])
    a[8], a[10] = m(a[8], a[10])
    a[11], a[14] = m(a[11], a[14])
    a[17], a[18] = m(a[17], a[18])
    a[0], a[1] = m(a[0], a[1])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[8] = m(a[4], a[8])
    a[6], a[10] = m(a[6], a[10])
    a[9], a[12] = m(a[9], a[12])
    a[14], a[15] = m(a[14], a[15])
    a[16], a[17] = m(a[16], a[17])
    a[1], a[2] = m(a[1], a[2])
    a[5], a[8] = m(a[5], a[8])
    a[6], a[7] = m(a[6], a[7])
    a[9], a[11] = m(a[9], a[11])
    a[10], a[13] = m(a[10], a[13])
    a[14], a[16] = m(a[14], a[16])
    a[15], a[17] = m(a[15], a[17])
    a[3], a[6] = m(a[3], a[6])
    a[4], a[5] = m(a[4], a[5])
    a[7], a[9] = m(a[7], a[9])
    a[8], a[10] = m(a[8], a[10])
    a[11], a[12] = m(a[11], a[12])
    a[13], a[14] = m(a[13], a[14])
    a[15], a[16] = m(a[15], a[16])
    a[3], a[4] = m(a[3], a[4])
    a[5], a[6] = m(a[5], a[6])
    a[7], a[8] = m(a[7], a[8])
    a[9], a[10] = m(a[9], a[10])
    a[11], a[13] = m(a[11], a[13])
    a[12], a[14] = m(a[12], a[14])
    a[2], a[3] = m(a[2], a[3])
    a[4], a[5] = m(a[4], a[5])
    a[6], a[7] = m(a[6], a[7])
    a[8], a[9] = m(a[8], a[9])
    a[10], a[11] = m(a[10], a[11])
    a[12], a[13] = m(a[12], a[13])
    a[14], a[15] = m(a[14], a[15])
    return a
    
dummy=np.arange(63).astype(np.uint64)
print('Compiling functions (numba)...')
print('..compiling 1 of 63, '); sort_small_array_1(dummy[:1])
print('..compiling 2 of 63, '); sort_small_array_2(dummy[:2])
print('..compiling 3 of 63, '); sort_small_array_3(dummy[:3])
print('..compiling 4 of 63, '); sort_small_array_4(dummy[:4])
print('..compiling 5 of 63, '); sort_small_array_5(dummy[:5])
print('..compiling 6 of 63, '); sort_small_array_6(dummy[:6])
print('..compiling 7 of 63, '); sort_small_array_7(dummy[:7])
print('..compiling 8 of 63, '); sort_small_array_8(dummy[:8])
print('..compiling 9 of 63, '); sort_small_array_9(dummy[:9])
print('..compiling 10 of 63, '); sort_small_array_10(dummy[:10])

Solution

  • Compilation time analysis

    The times does not grow exponentially, but rather quadratically at first glance. We can see that by plotting the compilation time against the function number.

    timing plot

    The thing is the number of line also grows significantly and is highly correlated with the time taken to compile the function. The size of a sorting network grows in O(n log² n) so the compilation time should grow at least as much as this.

    In fact, some compilation steps do not grow linearly to the input code size, but more than that. Some steps like optimizations can grow in O(n²). For example, the register allocation is a particularly expensive step, especially in you case. Indeed, it is known top be NP-hard if we want to find the best solution though compiler use relatively fast heuristics in practice. For more information about the complexity of compiler algorithms, please read this post. In your case, I expect most operations to be done in either O(n) time or O(n log n) time. This means a O(v log(v)**3) compilation time for the function number v. It turns out that the current results match very well with a O(n log(n)) compilation time where n is the number of line, confirming the expected compilation time complexity mentioned above.

    On top of that, Numba has not been designed to generate code particularly quickly because the Numba code is expected to be small (as opposed to native code or even Cython one). AFAIK, a significantly part of the code responsible for translating the Python code to the LLVM intermediate representation is done in Python (so it is interpreted and rather slow).


    Faster compilation part 1: inlining

    That being said, one issue is that you use inline='always' which tells Numba to inline the function manually itself. This can significantly increase the size of the generated IR code and so the compilation time. Note that LLVM can do the inlining without the compilation flag if it considers inlining worth it. This manual inlining process takes most of the compilation time on my machine. Indeed, here are the execution time with and without it:

    timings with/without manual inlining

    The time of the last one (n°17) decrease from 3010 ms to 1360 ms.


    Faster compilation part 2: reducing the replicated code

    There is another issue: when you do a naive a[1], Numba generates a code to access data in the Numpy array. The thing is Numba needs to take care of the wraparound feature of Numpy, the stride which is not guaranteed to be one and extract the value of the Numpy array based on that. This generate a significant code for such a basic primitive operation repeated about 2000 times in your code... The thing is Numba does not support basic C array so I do not think we can remove this overhead completely in Numba (Cython can with user flags to disable Numpy/CPython features like the wraparound).

    Still, we can avoid Numba to generate a lot of repeated code and move the burden to LLVM which is more optimized. One solution is to modify the function m so it performs the array read/writes based on the indices. This drastically reduce the size of the replicated code. Here is a part of the modified code:

    import numba as nb
    import numpy as np
    
    # This function calculates the min and the max of his parameters.
    @nb.njit((nb.uint64[:], nb.uint64, nb.uint64),fastmath=True)
    def m(arr: 'np.ndarray[np.uint64]', ia: np.uint64, ib: np.uint64):
        va = arr[ia]
        vb = arr[ib]
        arr[ia] = min(va, vb)
        arr[ib] = max(va, vb)
    
    def run1(): 
        @nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
        def sort_small_array_1(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
            return a
    
    def run2(): 
        print('Defining function 2')
        @nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
        def sort_small_array_2(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
            m(a, 0, 1)
            return a
    
    def run3():
        print('Defining function 3')
        @nb.njit(nb.uint64[:](nb.uint64[:]),fastmath=True)
        def sort_small_array_3(a: 'np.ndarray[np.uint64]') -> 'np.ndarray[np.uint64]':
            m(a, 0, 2)
            m(a, 0, 1)
            m(a, 1, 2)
            return a
    
    # [...]
    

    Here is the resulting timings:

    plot of timing with all compilation optimizations

    The timings are about 10 times faster compilation for nearly all function except the fastest ones (e.g. 3010 ms versus 295 for the last function).

    One need to check the performance of the resulting code since LLVM might generate a sub-optimal code. Note that ahead-of-time compilation is better suited for such use-case since the compilation is done once. In fact, native languages like C are perfect for this use-case. The code should be compiled much faster and the resulting code will be faster because there is no wraparound. On top of that, multiple C files can be compiled in parallel which is AFAIK not possible in Numba.


    Notes

    By the way, large sorting networks are generally inefficient. First, their size are sub-optimal for large ones (they grow in O(n log² n) while fast sorting algorithms run in O(n log n). Moreover, CPU are not designed to execute huge code. Huge code are executed less efficiently because of many small caches in modern mainstream CPUs (µops cache, loop stream detector, L1 instruction cache, etc.). I advise you to reconsider the need to use sorting networks bigger than like 15, especially for a Numba code. An optimized insertion sort can be faster than big sorted network for few dozens of items because of that.

    Note that the "Compiling functions" part of the code is useless since Numba functions are eagerly compiled when a signature is provided (which is the case in your code).


    Conclusion

    The rule of thumb is "Don't repeat yourself" (DRY)! This is clearly not good for humans, nor compilers, nor CPUs.