import numpy as np
from scipy.stats import norm, rv_continuous
from scipy.special import erf
import scipy.integrate as integrate
class normal_ratio_wiki(rv_continuous):
    """Ratio Z = X/Y of two independent normal variables X ~ N(mu_x, sigma_x**2)
    and Y ~ N(mu_y, sigma_y**2), using the closed-form PDF for the uncorrelated
    case from the Wikipedia "Ratio distribution" article."""

    def _pdf(self, z, mu_x, mu_y, sigma_x, sigma_y):
        # Intermediate quantities a(z), b(z), c, d(z) as named in the reference.
        a_z = np.sqrt(((1 / (sigma_x ** 2)) * (np.power(z, 2))) + (1 / (sigma_y ** 2)))
        b_z = ((mu_x / (sigma_x ** 2)) * z) + (mu_y / sigma_y ** 2)
        c = ((mu_x ** 2) / (sigma_x ** 2)) + ((mu_y ** 2) / (sigma_y ** 2))
        d_z = np.exp(((b_z ** 2) - ((c * a_z ** 2))) / (2 * (a_z ** 2)))
        # BUG FIX: the original used math.pi but never imported math (NameError);
        # np.pi is already available.
        pdf_z = ((b_z * d_z) / (a_z ** 3)) * (1 / (np.sqrt(2 * np.pi) * sigma_x * sigma_y)) * \
            (norm.cdf(b_z / a_z) - norm.cdf(-b_z / a_z)) + \
            ((1 / ((a_z ** 2) * np.pi * sigma_x * sigma_y)) * np.exp(-c / 2))
        return pdf_z

    def _argcheck(self, mu_x, mu_y, sigma_x, sigma_y):
        # Means may be any real number; only the scales must be positive.
        # (The default rv_continuous check would wrongly reject mu <= 0.)
        return (sigma_x > 0) & (sigma_y > 0)

    def _cdf(self, z, mu_x, mu_y, sigma_x, sigma_y):
        # BUG FIX: the original integrated over (-inf, inf), which is ~1 for
        # every z; the CDF is the integral of the PDF up to z.  np.vectorize
        # lets the generic rv_continuous machinery pass array arguments.
        def _cdf_single(z1, m_x, m_y, s_x, s_y):
            return integrate.quad(self._pdf, -np.inf, z1,
                                  args=(m_x, m_y, s_x, s_y))[0]
        return np.vectorize(_cdf_single)(z, mu_x, mu_y, sigma_x, sigma_y)
# Generate two independent normal samples and study the distribution of
# their ratio z = x / y.
rng1 = np.random.default_rng(99)
rng2 = np.random.default_rng(88)
# Sample Data 1
x = rng1.normal(141739.951, 1.223808e+06, 1000)
y = rng2.normal(333.91, 64.494571, 1000)
# Sample Data 2
# x = rng1.normal(500, 20, 1000)
# y = rng2.normal(400, 10, 1000)
z = x / y
# 1st approach with normal_ratio_wiki: moment-match the component normals.
mu_x = x.mean()
mu_y = y.mean()
sigma_x = x.std()
sigma_y = y.std()
rng3 = np.random.default_rng(11)
nr_wiki_inst = normal_ratio_wiki(name='normal_ratio_wiki', seed=rng3)
nr_wiki_vars = nr_wiki_inst.rvs(mu_x, mu_y, sigma_x, sigma_y, size=100)
# BUG FIX: fit() is a method of the distribution object, not of the
# ndarray of samples (ndarray has no .fit attribute).
nr_wiki_params = nr_wiki_inst.fit(nr_wiki_vars)
Hello, I am working on simulating the ratio distribution of two uncorrelated normal distributions by defining a custom distribution using scipy. The approach is from here.
While calling the rvs or fit method of the custom distribution defined above using either approach, I get the errors RuntimeError: Failed to converge after 100 iterations. and IntegrationWarning: The integral is probably divergent, or slowly convergent. respectively. If we comment out _cdf(...), then scipy.dist.rvs takes a very long time to run, but it still fails when calling fit. I tried different bounded intervals, but with no success.
I believe implementing custom _rvs, _ppf, and/or _fit methods may help resolve the issue. How should we define these based on the above _pdf and _cdf methods? Please advise.
Note that example integrate.quad(nr_wiki_inst.pdf, -np.inf, np.inf, args=(mu_x, mu_y, sigma_x, sigma_y))
works separately without any issues.
The _cdf method is essentially the same as the generic implementation, so the rest assumes that it's removed. I'm also assuming the _pdf implementation is correct, and I haven't seen anything that suggests otherwise yet. It reduces to the Cauchy distribution when the mus are 0 and the sigmas are 1, it seems to integrate to 1 even when the parameters change, and the PDF appears to agree with random variates (see below).
Integrating the PDF to get the CDF is bound to be pretty slow with quad
, but one thing that will make it much faster is to replace the use of scipy.stats.norm.cdf
with scipy.special.ndtr
in the PDF. SciPy distributions have a lot of overhead, but the special function jumps straight to evaluating the CDF of the normal distribution.
# IPython session comparing the cost of the two ways to evaluate the
# standard-normal CDF: special.ndtr jumps straight to the C routine,
# while stats.norm.cdf goes through the rv_continuous machinery.
from scipy import stats, special
x = 1.5
%timeit special.ndtr(x)
# 1.43 µs ± 726 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
%timeit stats.norm.cdf(x)
# 107 µs ± 21.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
This speeds up the CDF of your distribution by a factor of 10x-20x.
Improving the speed of RVS is also very easy. The generic implementation of _rvs
inverts the CDF (numerically) to get the PPF, and it passes in a uniformly distributed random number to get each observation (inverse transform sampling). We override _rvs
to just generate two normal random variates and take the ratio.
def _rvs(self, mu_x, mu_y, sigma_x, sigma_y, size=None, random_state=None):
x = random_state.normal(mu_x, sigma_x, size)
y = random_state.normal(mu_y, sigma_y, size)
return x/y
To check the agreement between this RVS implementation and the PDF:
# Visual check: histogram of 100k random variates against the analytic PDF.
rng = np.random.default_rng(8235834852452842)
nr_wiki_inst = normal_ratio_wiki(name='normal_ratio_wiki', seed=rng)
# Shape parameters of the ratio distribution X/Y.
mu_x = 1
mu_y = 2
sigma_x = 3
sigma_y = 4
# Freeze the distribution with these parameters.
dist = nr_wiki_inst(mu_x, mu_y, sigma_x, sigma_y)
rvs = dist.rvs(size = 100_000)
# Histogram is normalized over [-10, 10] only, so it slightly exceeds the PDF.
plt.hist(rvs, density=True, bins=np.linspace(-10, 10, 300))
x = np.linspace(-10, 10, 300)
pdf = dist.pdf(x)
plt.plot(x, pdf)
This generated 100_000 random variates from the distribution (in 8ms) and allows us to visually compare the histogram against the PDF. The agreement is good, and the apparent excess of the histogram can be explained by the fact that the histogram is normalized over the range [-10, 10] instead of the entire real line.
You can get a fast (but approximate) PPF method using scipy.stats.sampling.NumericalInversePolynomial
. It's easier to describe what NumericalInverseHermite
does: evaluate the CDF at many points and use that data to interpolate the PPF. NumericalInversePolynomial
also works using interpolation, but is more advanced, and it only needs the PDF.
# Build a fast approximate sampler/PPF from the PDF alone.
dist2 = sampling.NumericalInversePolynomial(dist, center=0, random_state=rng)
# show that `dist2`'s `ppf` method is the inverse of the original `cdf`
dist2.ppf(dist.cdf(1.5)) # 1.5000000001244032
I think fit
is having trouble in part because it doesn't know that sigma_x
and sigma_y
must be positive. For that, you need to define _argcheck
:
def _argcheck(self, mu_x, mu_y, sigma_x, sigma_y,):
return (sigma_x > 0.) & (sigma_y > 0.)
But there are other issues, and I don't think I'll debug it right now. Instead, we can define _shape_info
so that we can use the scipy.stats.fit
function with this distribution instead of the fit
method of the distribution.
# _ShapeInfo is private/not documented
# so don't use this for production code
from scipy.stats._distn_infrastructure import _ShapeInfo
...
def _shape_info(self):
# parameter name, discrete (True or False), domain, domain includes endpoints
return [_ShapeInfo("mu_x", False, (-np.inf, np.inf), (False, False)),
_ShapeInfo("mu_y", False, (-np.inf, np.inf), (False, False)),
_ShapeInfo("sigma_x", False, (0, np.inf), (False, False)),
_ShapeInfo("sigma_y", False, (0, np.inf), (False, False))]
Now the fit
function can be used:
# generate random variates to fit
rvs = dist.rvs(size=200)
# assign rough bounds to parameters
bounds = dict(mu_x=(-10, 10), mu_y=(-10, 10), sigma_x=(1e-1, 10), sigma_y=(1e-1, 10))
res = stats.fit(nr_wiki_inst, rvs, bounds=bounds)
# Show that negative log likelihood of fitted parameters is better
# (nllf takes the tuple (mu_x, mu_y, sigma_x, sigma_y, loc, scale)).
print(res.nllf((1, 2, 3, 4, 0, 1))) # 454.43887891645863
print(res.nllf()) # 453.19908132025944
# The latter corresponds with the fitted parameters.
# It's lower, so it's a better fit according to MLE
All together:
import numpy as np
import matplotlib.pyplot as plt
from scipy import integrate, stats, special
from scipy.stats import sampling
from scipy.stats._distn_infrastructure import _ShapeInfo
class normal_ratio_wiki(stats.rv_continuous):
    """Distribution of the ratio Z = X/Y of two independent normal variables,
    X ~ N(mu_x, sigma_x**2) and Y ~ N(mu_y, sigma_y**2) (uncorrelated case)."""

    def _pdf(self, z, mu_x, mu_y, sigma_x, sigma_y):
        # Helper quantities a(z), b(z), c, d(z) of the closed-form PDF.
        a = np.sqrt(((1 / (sigma_x ** 2)) * (z ** 2)) + (1 / (sigma_y ** 2)))
        b = ((mu_x / (sigma_x ** 2)) * z) + (mu_y / sigma_y ** 2)
        c = ((mu_x ** 2) / (sigma_x ** 2)) + ((mu_y ** 2) / (sigma_y ** 2))
        d = np.exp(((b ** 2) - (c * a ** 2)) / (2 * (a ** 2)))
        # special.ndtr is the standard-normal CDF; it is much faster than
        # stats.norm.cdf because it skips the distribution machinery.
        gauss_mass = special.ndtr(b / a) - special.ndtr(-b / a)
        term1 = ((b * d) / (a ** 3)) * gauss_mass / (np.sqrt(2 * np.pi) * sigma_x * sigma_y)
        term2 = np.exp(-c / 2) / ((a ** 2) * np.pi * sigma_x * sigma_y)
        return term1 + term2

    def _rvs(self, mu_x, mu_y, sigma_x, sigma_y, size=None, random_state=None):
        # Sample the numerator and denominator directly instead of inverting
        # the CDF numerically (inverse-transform sampling would be very slow).
        numerator = random_state.normal(mu_x, sigma_x, size)
        denominator = random_state.normal(mu_y, sigma_y, size)
        return numerator / denominator

    def _argcheck(self, mu_x, mu_y, sigma_x, sigma_y):
        # Means are unrestricted; the scales must be strictly positive.
        x_scale_ok = sigma_x > 0
        y_scale_ok = sigma_y > 0
        return x_scale_ok & y_scale_ok

    def _shape_info(self):
        # (parameter name, discrete?, domain, endpoints included?)
        whole_line = (-np.inf, np.inf)
        positive_half = (0, np.inf)
        open_ends = (False, False)
        return [_ShapeInfo("mu_x", False, whole_line, open_ends),
                _ShapeInfo("mu_y", False, whole_line, open_ends),
                _ShapeInfo("sigma_x", False, positive_half, open_ends),
                _ShapeInfo("sigma_y", False, positive_half, open_ends)]
# Demonstration: sample, compare the histogram to the PDF, build a fast PPF,
# and fit the distribution with scipy.stats.fit.
rng = np.random.default_rng(11)
nr_wiki_inst = normal_ratio_wiki(name='normal_ratio_wiki', seed=rng)
# Shape parameters for the frozen distribution.
mu_x = 1
mu_y = 2
sigma_x = 3
sigma_y = 4
dist = nr_wiki_inst(mu_x, mu_y, sigma_x, sigma_y)
rvs = dist.rvs(size = 100_000, random_state=rng)
x = np.linspace(-10, 10, 300)
pdf = dist.pdf(x)
plt.plot(x, pdf)
# Histogram is normalized over [-10, 10] only, so it slightly exceeds the PDF.
plt.hist(rvs, density=True, bins=np.linspace(-10, 10, 300))
# Fast approximate PPF via polynomial interpolation driven by the PDF.
dist2 = sampling.NumericalInversePolynomial(dist, center=0, random_state=rng)
dist2.ppf(dist.cdf(1.5)) # 1.5000000001244032
# Fit 200 variates with rough parameter bounds.
rvs = dist.rvs(size=200, random_state=rng)
bounds = dict(mu_x=(-10, 10), mu_y=(-10, 10), sigma_x=(1e-1, 10), sigma_y=(1e-1, 10))
res = stats.fit(nr_wiki_inst, rvs, bounds=bounds)
# Negative log likelihood: the no-arg call uses the fitted parameters,
# which give a lower (better) value than the true generating parameters.
print(res.nllf((1, 2, 3, 4, 0, 1))) # 399.2571952688255
print(res.nllf()) # 396.24324681778955