I have code that first samples random values uniformly from a domain of definition and then uses rejection sampling to redistribute them according to a given two-dimensional distribution:
import numpy as np
import sympy as sp
from sympy.utilities.lambdify import lambdify
from numba import jit, njit, prange
import time
# Step 1: Define Symbols
# NOTE(review): these SymPy symbols (and the sympy/lambdify imports) are never
# used in the snippet below — presumably left over from a symbolic version of
# the matrix element; confirm before removing.
E1, E3, m, m1, m2, m3, thetaV, phiV, kappa = sp.symbols('E1 E3 m m1 m2 m3 thetaV phiV kappa')
# Numba-compatible example distribution function
@njit
def example_distribution_numba(e1, e3):
    """Unnormalized 2-D Gaussian bump centered at (e1, e3) = (1.0, 0.8)."""
    de1 = e1 - 1.0
    de3 = e3 - 0.8
    return np.exp(-(de1 * de1 + de3 * de3))
@njit
def block_random_energies_old_numba(m, m1, m2, m3):
    """Draw one (E1, E3) pair by rejection sampling.

    Candidates are drawn uniformly inside the rectangular energy bounds and
    accepted only when E2 = m - E1 - E3 exceeds m2 and the quartic
    kinematic constraint (term1 < term2 in the original) holds.
    """
    # Upper energy bounds are fixed by the masses — compute them once.
    e3_hi = (m**2 + m3**2 - (m1 + m2)**2) / (2 * m)
    e1_hi = (m**2 + m1**2 - (m2 + m3)**2) / (2 * m)
    while True:
        e3 = np.random.uniform(m3, e3_hi)
        e1 = np.random.uniform(m1, e1_hi)
        e2 = m - e1 - e3
        if e2 > m2:
            p1sq = e1**2 - m1**2
            p3sq = e3**2 - m3**2
            lhs = (e2**2 - m2**2 - p1sq - p3sq)**2
            if lhs < 4 * p1sq * p3sq:
                return np.array([e1, e3])
@njit(parallel=True)
def block_random_energies_numba(m, m1, m2, m3, Nevents):
    """Draw Nevents (E1, E3) pairs, one rejection loop per event, in parallel."""
    out = np.empty((Nevents, 2))
    for k in prange(Nevents):
        out[k] = block_random_energies_old_numba(m, m1, m2, m3)
    return out
@njit(parallel=True)
def calculate_weights_numba(tabE1E3unweighted, m, m1, m2, m3):
    """Evaluate the example distribution at every (E1, E3) row in parallel.

    The mass arguments are unused here but kept for interface parity with
    the rest of the pipeline.
    """
    n = len(tabE1E3unweighted)
    w = np.empty(n)
    for k in prange(n):
        w[k] = example_distribution_numba(tabE1E3unweighted[k, 0],
                                          tabE1E3unweighted[k, 1])
    return w
def block_random_energies_weighted(m, m1, m2, m3, Nevents):
    """Return Nevents (E1, E3) pairs resampled according to the example weight.

    First draws uniform candidates from the allowed region, then performs
    importance resampling with np.random.choice.
    """
    n_candidates = max(Nevents, 10)
    candidates = block_random_energies_numba(m, m1, m2, m3, n_candidates)
    # Evaluate the target density at each candidate (Numba-parallel).
    weights = calculate_weights_numba(candidates, m, m1, m2, m3)
    weights = np.clip(weights, 0, None)  # ensure no negative weights
    # Weighted sampling is done with plain NumPy (np.random.choice is not
    # available inside njit code).
    chosen = np.random.choice(n_candidates, size=Nevents,
                              p=weights / weights.sum())
    return candidates[chosen]
# Parameters
MASSM = 2.0   # parent mass m
MASS1 = 0.5   # daughter mass m1
MASS2 = 0.4   # daughter mass m2
MASS3 = 0.3   # daughter mass m3
Nevents = 10**6
# Generate random energies and time the full weighted-sampling pipeline.
# NOTE(review): the first call also pays Numba's JIT compilation cost, so this
# wall-clock number overstates steady-state time — presumably intended as a
# rough benchmark; confirm by timing a second call.
start_time = time.time()
tabPSenergies = block_random_energies_weighted(MASSM, MASS1, MASS2, MASS3, Nevents)
end_time = time.time()
print(f"Timing for generating {Nevents} random energies: {end_time - start_time} seconds")
It works slow:
Timing for generating 1000000 random energies: 3.3475124835968018 seconds
For 30,000 events it is not really faster:
Timing for generating 30000 random energies: 2.61208176612854 seconds
Could you please tell me whether it is possible to speed it up?
I have vectorized your functions and it got around 10x faster.
Vectorizing `calculate_weights` and `example_distribution` was rather straightforward — I just removed the loop and the `njit` decorator. `block_random_energies`
is much more complicated. First I generate as many candidates as is needed + some 20%
# overhead (1.2 multiplier). Then I use the "valid" success rate of the first
# pass to calculate how many samples I need to generate in the next iteration
# of the loop.
def block_random_energies_vectorized(m, m1, m2, m3, n_events, success_rate=1.0):
    """Draw n_events kinematically valid (E1, E3) pairs with vectorized rejection.

    Candidates are generated in batches sized by the running acceptance rate
    (with a 20% safety margin) until enough valid pairs have been collected.

    Returns an array of shape (n_events, 2) with columns (E1, E3).
    """
    n_valid = 0
    n_missing = n_events
    E1r_valid = np.zeros(n_events)
    E3r_valid = np.zeros(n_events)
    # Energy bounds depend only on the masses — hoist them out of the loop.
    e3_hi = (m**2 + m3**2 - (m1 + m2)**2) / (2 * m)
    e1_hi = (m**2 + m1**2 - (m2 + m3)**2) / (2 * m)
    while n_missing > 0:
        # Clip to [0.001, 1] so the batch size stays bounded (avoids RAM blow-up).
        success_rate = min(max(success_rate, 0.001), 1.0)
        n_batch = int(1.2 * n_missing / success_rate)
        E3r = np.random.uniform(m3, e3_hi, n_batch)
        E1r = np.random.uniform(m1, e1_hi, n_batch)
        E2v = m - E1r - E3r
        term1 = (E2v**2 - m2**2 - (E1r**2 - m1**2) - (E3r**2 - m3**2))**2
        term2 = 4 * (E1r**2 - m1**2) * (E3r**2 - m3**2)
        is_valid = np.logical_and(E2v > m2, term1 < term2)
        current_n_valid = int(np.sum(is_valid))
        n_new_to_add = min(current_n_valid, n_missing)
        E1r_valid[n_valid:n_valid + n_new_to_add] = E1r[is_valid][:n_new_to_add]
        E3r_valid[n_valid:n_valid + n_new_to_add] = E3r[is_valid][:n_new_to_add]
        success_rate = current_n_valid / n_batch
        # BUG FIX: advance the counters by the number of samples actually
        # stored (n_new_to_add), not by current_n_valid — the original could
        # overshoot n_events when the final batch over-produced, leaving
        # n_valid/n_missing inconsistent with the filled arrays.
        n_missing -= n_new_to_add
        n_valid += n_new_to_add
    return np.array([E1r_valid, E3r_valid]).T
def calculate_weights_vectorized(tabE1E3unweighted, m, m1, m2, m3):
    """Vectorized weight evaluation over the (N, 2) candidate table.

    The mass arguments are unused but kept for interface parity with the
    Numba version.
    """
    e1_col = tabE1E3unweighted[:, 0]
    e3_col = tabE1E3unweighted[:, 1]
    return example_distribution_vectorized(e1_col, e3_col)
def example_distribution_vectorized(e1, e3):
    """Unnormalized 2-D Gaussian bump centered at (e1, e3) = (1.0, 0.8)."""
    de1 = e1 - 1.0
    de3 = e3 - 0.8
    return np.exp(-(de1 * de1 + de3 * de3))
def block_random_energies_weighted(m, m1, m2, m3, Nevents):
    """Return Nevents (E1, E3) pairs resampled according to the example weight.

    Same contract as the Numba version, but built on the vectorized
    candidate generator.
    """
    n_candidates = max(Nevents, 10)
    candidates = block_random_energies_vectorized(m, m1, m2, m3, n_candidates)
    weights = calculate_weights_vectorized(candidates, m, m1, m2, m3)
    weights = np.where(weights < 0, 0, weights)  # Ensure no negative weights
    # Weighted resampling stays in plain NumPy.
    chosen = np.random.choice(n_candidates, size=Nevents,
                              p=weights / weights.sum())
    return candidates[chosen]
On my machine time went down from about 3.5s to about 0.35s (10x speed-up). 60% of the time is now spent in the line:
tabsel_indeces = np.random.choice(len(tabE1E3unweighted), size=Nevents, p=weights1 / weights1.sum())
I also checked distributions of block_random_energies and they match each other
# Side-by-side histograms: top row = old (Numba) generator, bottom row =
# vectorized generator, columns = E1 and E3.
# NOTE(review): this snippet assumes `import matplotlib.pyplot as plt` and
# that `tabE1E3unweighted_old` was produced by the (commented-out) call to
# block_random_energies_numba — both must be restored to run it.
fig, axes = plt.subplots(2, 2)
axes[0, 0].hist(tabE1E3unweighted_old[:, 0], 200)
axes[0, 1].hist(tabE1E3unweighted_old[:, 1], 200)
axes[1, 0].hist(tabE1E3unweighted[:, 0], 200)
axes[1, 1].hist(tabE1E3unweighted[:, 1], 200)
plt.show()