I have an age variable. When I plotted it using a KDE plot and a Q-Q plot, the distribution seemed normal; however, when I performed the KS test, the test statistic was 1.0 and the p-value was 0.0.
Can someone please help me understand this result? I used the KS test on other variables, and for those the result was consistent with the visualization.
# Libraries
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as sps
import statsmodels.api as sm  # needed for sm.qqplot below
# the age variable
age = np.array([87, 88, 75, 76, 80, 88, 90, 80, 83, 85, 71, 73, 75, 93, 95, 68, 69,
66, 68, 78, 80, 83, 81, 82, 85, 76, 77, 88, 90, 80, 81, 85, 86, 87,
88, 92, 80, 82, 84, 72, 76, 61, 64, 86, 87, 82, 84, 69, 71, 73, 74,
64, 66, 77, 80, 60, 62, 86, 88, 91, 90, 92, 79, 80, 82, 84, 88, 89,
69, 70, 73, 75, 82, 85, 88, 89, 81, 83, 84, 86, 88, 71, 73, 75, 70,
73, 72, 73, 68, 69, 71, 75, 77, 83, 85, 77, 78, 66, 66, 68, 68, 69,
69, 70, 71, 71, 72, 92, 94, 97, 74, 78, 82, 84, 85, 87, 65, 67, 71,
73, 81, 83, 85, 78, 79, 80, 75, 78, 68, 70, 72, 79, 81, 83, 80, 81,
78, 81, 82, 61, 62, 67, 68, 71, 73, 88, 90, 81, 82, 80, 82, 84, 85,
86, 83, 84, 70, 72, 75, 76, 77, 73, 75, 66, 69, 71, 69, 73, 89, 91,
92, 69, 71, 73, 66, 68, 69, 82, 84, 78, 80, 63, 65, 96, 98, 78, 80,
70, 72, 73, 75, 76, 75, 78, 83, 84, 61, 63, 71, 72, 74, 89, 91, 74,
77, 66, 67, 80, 83, 77, 80, 82, 71, 74, 76, 82, 84, 86, 69, 74, 75,
70, 71, 86, 87, 70, 72, 77, 79, 81, 83, 62, 65, 76, 78, 73, 75, 76,
78, 73, 75, 73, 74, 76, 78, 67, 71, 81, 83, 85, 76, 78, 73, 74, 86,
88, 70, 71, 74, 75, 77, 79, 81, 81, 84, 86, 76, 79, 78, 80, 82, 65,
67, 78, 81, 70, 71, 74, 78, 74, 75, 73, 75, 67, 68, 76, 78, 81, 65,
68, 69, 71, 89, 91, 93, 77, 79, 68, 73, 80, 82, 77, 78, 80, 82, 81,
83, 73, 75, 66, 68, 69, 75, 77, 78, 81, 73, 75, 73, 76, 73, 76, 76,
78, 77, 79, 80, 82, 84, 77, 79, 78, 80, 71, 73, 76, 77, 81, 75, 79,
60, 62, 64, 70, 72, 73, 84, 87, 89, 68, 70, 89, 90, 93, 79, 81, 74,
75, 77, 73, 75, 66, 66, 68, 72, 72, 73, 80, 82, 86, 61, 63, 65])
# Visualization
fig, ax = plt.subplots(1, 2) # 1 row, 2 columns of plots
fig.set_figheight(4) # set height
fig.set_figwidth(8) # set width
sns.kdeplot(age, color='red',
            alpha=.1, fill=True,
            ax=ax[0]) # Distribution plot
sm.qqplot(age, fit = True, line = '45', ax = ax[1]) # qqplot
fig.tight_layout() # Tight layout
plt.show() # show plots
# KS test (because n > 50)
print('n =', age.size)
sps.kstest(age, 'norm')
@Timur Shtatland is correct. Your code is:
sps.kstest(age, 'norm')
Without specifying the parameters of the normal distribution, you are comparing your data to a standard normal distribution (mean 0, standard deviation 1). Since every age lies far out in the tail of that reference distribution, it is not surprising that the statistic is 1.0 and the p-value is effectively zero. Instead, you should use the mean and standard deviation of your data:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy import stats
# Data
data = np.array([87, 88, 75, 76, 80, 88, 90, 80, 83, 85, 71, 73, 75, 93, 95, 68, 69,
66, 68, 78, 80, 83, 81, 82, 85, 76, 77, 88, 90, 80, 81, 85, 86, 87,
88, 92, 80, 82, 84, 72, 76, 61, 64, 86, 87, 82, 84, 69, 71, 73, 74,
64, 66, 77, 80, 60, 62, 86, 88, 91, 90, 92, 79, 80, 82, 84, 88, 89,
69, 70, 73, 75, 82, 85, 88, 89, 81, 83, 84, 86, 88, 71, 73, 75, 70,
73, 72, 73, 68, 69, 71, 75, 77, 83, 85, 77, 78, 66, 66, 68, 68, 69,
69, 70, 71, 71, 72, 92, 94, 97, 74, 78, 82, 84, 85, 87, 65, 67, 71,
73, 81, 83, 85, 78, 79, 80, 75, 78, 68, 70, 72, 79, 81, 83, 80, 81,
78, 81, 82, 61, 62, 67, 68, 71, 73, 88, 90, 81, 82, 80, 82, 84, 85,
86, 83, 84, 70, 72, 75, 76, 77, 73, 75, 66, 69, 71, 69, 73, 89, 91,
92, 69, 71, 73, 66, 68, 69, 82, 84, 78, 80, 63, 65, 96, 98, 78, 80,
70, 72, 73, 75, 76, 75, 78, 83, 84, 61, 63, 71, 72, 74, 89, 91, 74,
77, 66, 67, 80, 83, 77, 80, 82, 71, 74, 76, 82, 84, 86, 69, 74, 75,
70, 71, 86, 87, 70, 72, 77, 79, 81, 83, 62, 65, 76, 78, 73, 75, 76,
78, 73, 75, 73, 74, 76, 78, 67, 71, 81, 83, 85, 76, 78, 73, 74, 86,
88, 70, 71, 74, 75, 77, 79, 81, 81, 84, 86, 76, 79, 78, 80, 82, 65,
67, 78, 81, 70, 71, 74, 78, 74, 75, 73, 75, 67, 68, 76, 78, 81, 65,
68, 69, 71, 89, 91, 93, 77, 79, 68, 73, 80, 82, 77, 78, 80, 82, 81,
83, 73, 75, 66, 68, 69, 75, 77, 78, 81, 73, 75, 73, 76, 73, 76, 76,
78, 77, 79, 80, 82, 84, 77, 79, 78, 80, 71, 73, 76, 77, 81, 75, 79,
60, 62, 64, 70, 72, 73, 84, 87, 89, 68, 70, 89, 90, 93, 79, 81, 74,
75, 77, 73, 75, 66, 66, 68, 72, 72, 73, 80, 82, 86, 61, 63, 65])
# Fit a normal distribution to the data
mu, std = norm.fit(data)
# Shapiro-Wilk test for normality
shapiro_test = stats.shapiro(data)
print("\nShapiro-Wilk Test:")
print("Statistic: {:.2f}".format(shapiro_test[0]))
print("p-value: {:.2f}".format(shapiro_test[1]))
# Perform the KS test for normality
ks_statistic, p_value = stats.kstest(data, 'norm', args=(mu, std))
print("\nKolmogorov-Smirnov Test:")
print("Statistic: {:.2f}".format(ks_statistic))
print("p-value: {:.2f}".format(p_value))```
which produces this:
Shapiro-Wilk Test:
Statistic: 0.99
p-value: 0.07
Kolmogorov-Smirnov Test:
Statistic: 0.05
p-value: 0.21
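For contrast, you can see directly why your original call returned a statistic of exactly 1.0: under a standard normal reference, the CDF at your smallest age is already 1 to machine precision, while the empirical CDF there is still 0, so the two curves are as far apart as they can possibly be. A quick check, reusing data, norm, and stats from the code above:
# Even the smallest age is ~60 standard deviations above a standard normal mean,
# so the reference CDF is already 1.0 there
print(norm.cdf(data.min()))        # effectively 1.0
# Reproduces the original result: statistic = 1.0, p-value = 0.0
print(stats.kstest(data, 'norm'))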
I would also plot a histogram of the data and then overlay a normal density with the parameters from your data:
# Create histogram of the data
count, bins, ignored = plt.hist(data, 20, density=True, alpha=0.5, color='gray')
# Plot the PDF of the normal distribution
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
title = "Fit results: mu = %.2f, std = %.2f" % (mu, std)
plt.title(title)
plt.xlabel('Age')
plt.ylabel('Density')
plt.show()
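One caveat: because mu and std are estimated from the same sample being tested, the p-value from the standard KS test tends to come out larger than it should. The Lilliefors test is the variant of the KS test that corrects the critical values for this. A minimal sketch, assuming statsmodels is installed and data is defined as above:
from statsmodels.stats.diagnostic import lilliefors
# Lilliefors test: KS test for normality with mean and std estimated from the sample
ks_stat, p_val = lilliefors(data, dist='norm')
print("Lilliefors statistic: {:.2f}, p-value: {:.2f}".format(ks_stat, p_val))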