Tags: python, performance, numba

How to speed up the following python code generating random values from a distribution?


I have a code that first samples random values from some domain of definition and then uses rejection sampling to regenerate them according to some double distribution:

import numpy as np
import sympy as sp
from sympy.utilities.lambdify import lambdify
from numba import jit, njit, prange
import time

# Step 1: define the sympy symbols used for the kinematics.
# NOTE(review): none of these symbols are referenced in the code shown below —
# presumably they are used elsewhere (e.g. to lambdify a matrix element); confirm before removing.
E1, E3, m, m1, m2, m3, thetaV, phiV, kappa = sp.symbols(
    'E1 E3 m m1 m2 m3 thetaV phiV kappa'
)

# Numba-compatible example distribution function
@njit
def example_distribution_numba(e1, e3):
    """Unnormalized Gaussian bump centered at (e1, e3) = (1, 0.8).

    Used as the example target density for the rejection/reweighting step.
    """
    de1 = e1 - 1
    de3 = e3 - 0.8
    return np.exp(-(de1 ** 2 + de3 ** 2))

@njit
def block_random_energies_old_numba(m, m1, m2, m3):
    """Draw one kinematically allowed (E1, E3) pair by rejection sampling.

    Proposes (E1, E3) uniformly inside the rectangular energy bounds of a
    three-body decay of mass `m` into masses `m1`, `m2`, `m3`, and keeps the
    pair only when the Dalitz-boundary inequality is satisfied.
    Returns a length-2 array [E1, E3].
    """
    # Upper kinematic limits are loop-invariant, so compute them once.
    e3_hi = (m ** 2 + m3 ** 2 - (m1 + m2) ** 2) / (2 * m)
    e1_hi = (m ** 2 + m1 ** 2 - (m2 + m3) ** 2) / (2 * m)
    while True:
        e3 = np.random.uniform(m3, e3_hi)
        e1 = np.random.uniform(m1, e1_hi)
        e2 = m - e1 - e3
        # E2 must exceed its rest mass for the configuration to be physical.
        if e2 <= m2:
            continue
        lhs = (e2 ** 2 - m2 ** 2 - (e1 ** 2 - m1 ** 2) - (e3 ** 2 - m3 ** 2)) ** 2
        rhs = 4 * (e1 ** 2 - m1 ** 2) * (e3 ** 2 - m3 ** 2)
        if lhs < rhs:
            return np.array([e1, e3])

@njit(parallel=True)
def block_random_energies_numba(m, m1, m2, m3, Nevents):
    """Generate `Nevents` valid (E1, E3) pairs in parallel.

    Each row of the returned (Nevents, 2) array is one accepted sample from
    block_random_energies_old_numba; rows are filled independently via prange.
    """
    out = np.empty((Nevents, 2))
    for idx in prange(Nevents):
        out[idx] = block_random_energies_old_numba(m, m1, m2, m3)
    return out

@njit(parallel=True)
def calculate_weights_numba(tabE1E3unweighted, m, m1, m2, m3):
    """Evaluate the example target density at every (E1, E3) row.

    The mass arguments are accepted for interface symmetry but are not used
    by the example distribution. Returns a 1-D array of weights.
    """
    n = len(tabE1E3unweighted)
    weights = np.empty(n)
    for idx in prange(n):
        weights[idx] = example_distribution_numba(
            tabE1E3unweighted[idx, 0], tabE1E3unweighted[idx, 1]
        )
    return weights

def block_random_energies_weighted(m, m1, m2, m3, Nevents):
    """Sample `Nevents` (E1, E3) pairs distributed according to the target density.

    Generates uniform phase-space proposals with the numba sampler, weights
    them with the example distribution, and resamples with replacement
    proportionally to the weights.
    """
    n_proposals = max(Nevents, 10 ** 1)
    proposals = block_random_energies_numba(m, m1, m2, m3, n_proposals)

    # Weights from the compiled kernel; clamp negatives so probabilities stay valid.
    weights1 = calculate_weights_numba(proposals, m, m1, m2, m3)
    weights1 = np.where(weights1 < 0, 0, weights1)

    # Weighted resampling is done in plain numpy (np.random.choice is not njit-able).
    chosen = np.random.choice(len(proposals), size=Nevents, p=weights1 / weights1.sum())
    return proposals[chosen]

# Parameters: parent mass and the three daughter masses (units implied by context),
# plus the number of events to draw.
MASSM = 2.0
MASS1 = 0.5
MASS2 = 0.4
MASS3 = 0.3
Nevents = 10**6

# Generate random energies and report wall-clock time.
# NOTE(review): the first call also pays numba's JIT compilation cost, so this
# timing overstates the steady-state cost — time a second call to separate them.
start_time = time.time()
tabPSenergies = block_random_energies_weighted(MASSM, MASS1, MASS2, MASS3, Nevents)
end_time = time.time()
print(f"Timing for generating {Nevents} random energies: {end_time - start_time} seconds")

It runs slowly:

Timing for generating 1000000 random energies: 3.3475124835968018 seconds

For 30,000 events, it is not much faster:

Timing for generating 30000 random energies: 2.61208176612854 seconds

Could you please tell me whether it is possible to speed it up?


Solution

  • I vectorized your functions, which made the code around 10x faster.

    def block_random_energies_vectorized(m, m1, m2, m3, n_events, success_rate=1.0):
        """Vectorized rejection sampler for (E1, E3) pairs of a three-body decay.

        Proposes batches of uniform (E1, E3) pairs inside the rectangular
        kinematic bounds and keeps those satisfying the Dalitz-boundary
        inequality, repeating until `n_events` samples are collected.

        Parameters
        ----------
        m, m1, m2, m3 : float
            Parent and daughter masses.
        n_events : int
            Number of accepted samples to return.
        success_rate : float, optional
            Initial acceptance-rate estimate used to size the first batch;
            it is re-estimated from each batch afterwards.

        Returns
        -------
        ndarray of shape (n_events, 2) with columns (E1, E3).
        """
        n_valid = 0
        n_missing = n_events
        E1r_valid = np.zeros(n_events)
        E3r_valid = np.zeros(n_events)
        while n_missing > 0:
            # Clip to [0.001, 1] so a near-zero estimate cannot request a huge batch.
            success_rate = min(max(success_rate, 0.001), 1.0)
            batch = int(1.2 * n_missing / success_rate)  # 20% safety margin
            E3r = np.random.uniform(m3, (m ** 2 + m3 ** 2 - (m1 + m2) ** 2) / (2 * m), batch)
            E1r = np.random.uniform(m1, (m ** 2 + m1 ** 2 - (m2 + m3) ** 2) / (2 * m), batch)
            E2v = m - E1r - E3r

            term1 = (E2v ** 2 - m2 ** 2 - (E1r ** 2 - m1 ** 2) - (E3r ** 2 - m3 ** 2)) ** 2
            term2 = 4 * (E1r ** 2 - m1 ** 2) * (E3r ** 2 - m3 ** 2)
            is_valid = np.logical_and(E2v > m2, term1 < term2)
            current_n_valid = int(np.sum(is_valid))

            # Keep at most as many samples as are still missing.
            n_new_to_add = min(current_n_valid, n_missing)
            E1r_valid[n_valid:n_valid + n_new_to_add] = E1r[is_valid][:n_new_to_add]
            E3r_valid[n_valid:n_valid + n_new_to_add] = E3r[is_valid][:n_new_to_add]
            success_rate = current_n_valid / len(is_valid)
            # FIX: advance the counters by the number actually stored
            # (n_new_to_add), not by current_n_valid — otherwise n_missing can
            # overshoot below zero and n_valid can point past the filled region.
            n_missing -= n_new_to_add
            n_valid += n_new_to_add

        return np.array([E1r_valid, E3r_valid]).T
    
    
    
    def calculate_weights_vectorized(tabE1E3unweighted, m, m1, m2, m3):
        """Evaluate the example target density at each (E1, E3) row.

        The mass arguments are accepted for interface symmetry with the numba
        version but are not used by the example distribution.
        """
        e1 = tabE1E3unweighted[:, 0]
        e3 = tabE1E3unweighted[:, 1]
        # Inlined example_distribution_vectorized: Gaussian bump at (1, 0.8).
        return np.exp(-((e1 - 1) ** 2 + (e3 - 0.8) ** 2))
    
    
    def example_distribution_vectorized(e1, e3):
        """Unnormalized Gaussian bump centered at (e1, e3) = (1, 0.8); numpy-vectorized."""
        de1 = e1 - 1
        de3 = e3 - 0.8
        return np.exp(-(de1 * de1 + de3 * de3))
    
    
    def block_random_energies_weighted(m, m1, m2, m3, Nevents):
        """Sample `Nevents` (E1, E3) pairs distributed according to the target density.

        Uses the vectorized rejection sampler for uniform phase-space
        proposals, then resamples them with replacement proportionally to the
        example-distribution weights.
        """
        n_proposals = max(Nevents, 10 ** 1)
        proposals = block_random_energies_vectorized(m, m1, m2, m3, n_proposals)

        # Weights from the vectorized density; clamp negatives so the
        # probability vector passed to np.random.choice stays valid.
        weights1 = calculate_weights_vectorized(proposals, m, m1, m2, m3)
        weights1 = np.where(weights1 < 0, 0, weights1)

        # Weighted resampling in plain numpy.
        chosen = np.random.choice(len(proposals), size=Nevents, p=weights1 / weights1.sum())
        return proposals[chosen]
        
    
    

    On my machine the time went down from about 3.5s to about 0.35s (a 10x speed-up). 60% of the time is now spent in the line:

    tabsel_indeces = np.random.choice(len(tabE1E3unweighted), size=Nevents, p=weights1 / weights1.sum())
    

    I also checked the distributions produced by both versions of block_random_energies, and they match each other:

    # Visual check: marginal histograms of E1 (left) and E3 (right) for the
    # original numba sampler (top row) vs the vectorized sampler (bottom row).
    # NOTE(review): assumes `import matplotlib.pyplot as plt` and that
    # `tabE1E3unweighted_old` holds the output of the original sampler — neither
    # is shown in this snippet.
    fig, axes = plt.subplots(2, 2)
    axes[0, 0].hist(tabE1E3unweighted_old[:, 0], 200)
    axes[0, 1].hist(tabE1E3unweighted_old[:, 1], 200)
    axes[1, 0].hist(tabE1E3unweighted[:, 0], 200)
    axes[1, 1].hist(tabE1E3unweighted[:, 1], 200)
    plt.show()
    

    Distributions comparison