I would like to add significance stars (p-values) to the autocorrelations in a df (by column).
How can I incorporate the significance stars next to each autocorrelation coefficient?
from statsmodels.tsa.stattools import acf
def autocorr_with_asterisks(df):
    """
    Calculate autocorrelation coefficients and add significance asterisks.

    For each column the autocorrelation function is computed for lags 0-9
    together with the Ljung-Box Q statistic p-values. Each coefficient is
    converted to a string and suffixed with '***' (p < 0.01), '**'
    (p < 0.05), '*' (p < 0.1) or nothing. Lag 0 never receives asterisks
    because no p-value is computed for it.

    Parameters:
    df (DataFrame): Input DataFrame with time series data.

    Returns:
    DataFrame: DataFrame containing autocorrelation coefficients with significance asterisks.
    """
    def stars(p_val):
        # Map a p-value onto the conventional significance markers.
        if p_val < 0.01:
            return '***'
        if p_val < 0.05:
            return '**'
        if p_val < 0.1:
            return '*'
        return ''

    table = {}
    for col in df.columns:
        # With qstat=True, acf returns (acf, Q statistics, p-values).
        acf_vals, _, p_vals = acf(df[col], nlags=9, qstat=True)
        # acf_vals includes lag 0 but the p-values start at lag 1, so pad
        # the markers with an empty entry to keep both sequences aligned.
        marks = [''] + [stars(p_val) for p_val in p_vals]
        table[col] = [f"{coef}{mark}" for coef, mark in zip(acf_vals, marks)]

    autocorr_df_with_asterisks = pd.DataFrame(table)
    return autocorr_df_with_asterisks
Sample Data:
# Simulate independent random walks with drift, one per column:
#     x[i] = x[i - 1] + drift + e[i],   e[i] ~ N(0, 1),   x[0] = 0
# This is an AR(1) recursion with a unit coefficient, so the series are
# non-stationary.
n_obs, n_series = 200, 5
drift = 0.1
data = np.zeros((n_obs, n_series))
for col in range(n_series):
    for i in range(1, n_obs):
        data[i, col] = data[i - 1, col] + drift + np.random.randn()
df = pd.DataFrame(data, columns=[f'Column_{k + 1}' for k in range(n_series)])
You weren't far off with your solution. Here is one way to do it. I created a new DataFrame to show an example. Note also that I made the number of lags a parameter of the function so that you can change it if you need to.
from statsmodels.tsa.stattools import acf
import pandas as pd
import numpy as np
# Common time index for all example series.
t = np.arange(250)
n_obs = t.size

# Three sinusoids of different periods, each with standard normal noise added.
series1 = np.sin(t / 10) + np.random.normal(loc=0, scale=1, size=n_obs)
series2 = np.cos(t / 20) + np.random.normal(loc=0, scale=1, size=n_obs)
series3 = np.sin(t / 30) + np.random.normal(loc=0, scale=1, size=n_obs)

# Gaussian noise with mean 0.5 and standard deviation 0.5 ...
series4 = np.random.normal(loc=0.5, scale=0.5, size=n_obs)
# ... and the running sum of another such noise draw.
series5 = np.random.normal(loc=0.5, scale=0.5, size=n_obs).cumsum()

df = pd.DataFrame(
    dict(
        Series1=series1,
        Series2=series2,
        Series3=series3,
        Series4=series4,
        Series5=series5,
    )
)
def autocorr_df_with_significance(df, nlags=9):
    """
    Calculate autocorrelation coefficients and add significance asterisks based on Ljung-Box Q statistic.

    Each coefficient is formatted with two decimals and suffixed with '***'
    (p < 0.01), '**' (p < 0.05), '*' (p < 0.1) or nothing. Lag 0 never
    receives asterisks because no p-value is computed for it.

    Parameters:
    df (DataFrame): Input DataFrame with time series data.
    nlags (int): Number of lags for autocorrelation calculation.

    Returns:
    DataFrame: DataFrame containing autocorrelation coefficients with significance asterisks.
    """
    formatted = {}
    for col in df.columns:
        # qstat=True makes acf return (acf, Q statistics, p-values), so the
        # Ljung-Box p-values come from the already-imported acf and no
        # separate q_stat import is required.
        acf_vals, _, p_vals = acf(df[col], nlags=nlags, fft=True, qstat=True)
        # The p-values start at lag 1; prepend NaN for lag 0 so they line up
        # with acf_vals. NaN fails every comparison below and yields ''.
        p_vals = np.append([np.nan], p_vals)
        asterisks = [
            '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.1 else ''
            for p in p_vals
        ]
        formatted[col] = [f"{val:.2f}{ast}" for val, ast in zip(acf_vals, asterisks)]
    # Row labels are the lag numbers 0..nlags.
    return pd.DataFrame(formatted, index=range(nlags + 1))
# Call the function under its actual name and keep the result in a separate
# variable so the function itself is not overwritten.
autocorr_table = autocorr_df_with_significance(df)
autocorr_table
which results in
Series1 Series2 Series3 Series4 Series5
0 1.00* 1.00* 1.00* 1.00* 1.00*
1 0.33* 0.22* 0.36* 0.08 0.99*
2 0.40* 0.26* 0.41* -0.02 0.98*
3 0.35* 0.22* 0.38* -0.08 0.96*
4 0.29* 0.28* 0.36* 0.01 0.95*
5 0.21* 0.27* 0.33* 0.05 0.94*
6 0.26* 0.24* 0.28* 0.08 0.93*
7 0.30* 0.23* 0.35* -0.03 0.91*
8 0.22* 0.26* 0.32* -0.04 0.90*
9 0.26* 0.18* 0.30* -0.00 0.89*
In this case there is just one asterisk, but with your actual data it should produce the output you want.