I have a dataframe df
, I want to do the following:
column_1
to column_84
) to compare if there is a statistical difference between Types X
, Y
and Z
The stats tests are Kruskal
and Dunn's
tests
The comparing group: X vs Y
, Y vs Z
and X vs Z
# copy & paste
## generate dataframe "df"
import pandas as pd
import numpy as np
df = pd.DataFrame(
data=np.random.uniform(low=5.5, high=30.75, size=(60, 84)),
columns=[f'column_{i}' for i in range(1, 85)],)
df.insert(loc=0, column='Type',value=np.repeat(['X','Y','Z'], 20, axis=0),)
df
I want to run kruskal wallis test
and Dunn test
for each column col_1
to col_84
# copy and paste the libraries below
from scipy.stats import kruskal
pip install scikit-posthocs
import scikit_posthocs as sp
# filtering for each Type X,Y and Z
# for column_1
# Extract values for each group
group_x_values = df[df['Type'] == 'X']['column_1'].values
group_y_values = df[df['Type'] == 'Y']['column_1'].values
group_z_values = df[df['Type'] == 'Z']['column_1'].values
# 1st stats test : Kruskal wallis
h_statistic, p_value = kruskal(group_x_values, group_y_values, group_z_values)
# Print the results
print(f"H-statistic: {h_statistic}")
print(f"P-value: {p_value}")
# 2nd stats test: Dunn test
data = [df[df['Group'] == 'X']['column_1'].values,
df[df['Group'] == 'Y']['column_1'].values,
df[df['Group'] == 'Z']['column_1'].values]
p_values = sp.posthoc_dunn(data, p_adjust='bonferroni')
print(p_values)
# for column_2
group_x_values = df[df['Type'] == 'X']['column_2'].values
group_y_values = df[df['Type'] == 'Y']['column_2'].values
group_z_values = df[df['Type'] == 'Z']['column_2'].values
# 1st stats test : Kruskal wallis
h_statistic, p_value = kruskal(group_x_values, group_y_values, group_z_values)
# Print the results
print(f"H-statistic: {h_statistic}")
print(f"P-value: {p_value}")
# 2nd stats test: Dunn test
data = [df[df['Group'] == 'X']['column_2'].values,
df[df['Group'] == 'Y']['column_2'].values,
df[df['Group'] == 'Z']['column_2'].values]
p_values = sp.posthoc_dunn(data, p_adjust='bonferroni')
print(p_values)
.
.
.
#for column_84
group_x_values = df[df['Type'] == 'X']['column_84'].values
group_y_values = df[df['Type'] == 'Y']['column_84'].values
group_z_values = df[df['Type'] == 'Z']['column_84'].values
# 1st stats test : Kruskal wallis
h_statistic, p_value = kruskal(group_x_values, group_y_values, group_z_values)
# Print the results
print(f"H-statistic: {h_statistic}")
print(f"P-value: {p_value}")
# 2nd stats test: Dunn test
data = [df[df['Group'] == 'X']['column_84'].values,
df[df['Group'] == 'Y']['column_84'].values,
df[df['Group'] == 'Z']['column_84'].values]
p_values = sp.posthoc_dunn(data, p_adjust='bonferroni')
print(p_values)
I want to export both results to excel worksheet, something like this:
Kruskal Worksheet
Dunn Worksheet
Few things to adjust here. First, try not to duplicate code. You are essentially running the same process on different columns. Just loop through the columns you want it to process instead of writing the same code multiple times (Are you really going to duplicate that code 84 times for columns 1-84)? Make it dynamic. Second thing, you'll need to use pandas to transform your data. You can then create your final output dataframes to write to excel. Use pandas ExcelWriter to write to workbook.
from scipy.stats import kruskal
#pip install scikit-posthocs
import scikit_posthocs as sp
columns_list_num = df.select_dtypes(include='number').columns.tolist()
kruskal_data = {}
for col in columns_list_num:
kruskal_data.update({f'{col}': {}})
dunns_data_dfs = []
# filtering for each Type X,Y and Z
for col in columns_list_num:
# Extract values for each group
group_x_values = df[df['Type'] == 'X'][f'{col}'].values
group_y_values = df[df['Type'] == 'Y'][f'{col}'].values
group_z_values = df[df['Type'] == 'Z'][f'{col}'].values
# 1st stats test : Kruskal wallis
h_statistic, p_value = kruskal(group_x_values, group_y_values, group_z_values)
kruskal_data[f'{col}'].update({'H-statistic':h_statistic})
kruskal_data[f'{col}'].update({'P-value':p_value})
# Print the results
print(f"H-statistic: {h_statistic}")
print(f"P-value: {p_value}\n")
# 2nd stats test: Dunn test
data = [group_x_values,
group_y_values,
group_z_values]
p_values = sp.posthoc_dunn(data, p_adjust='bonferroni')
p_values.index = ['X', 'Y', 'Z']
p_values.columns = ['X', 'Y', 'Z']
pairwise_data = {
'Dunns Results' : f'{col}',
'X vs Y': p_values.loc['X', 'Y'],
'Y vs Z': p_values.loc['Y', 'Z'],
'X vs Z': p_values.loc['X', 'Z']
}
dunns_data_dfs.append(pairwise_data)
print('\n',p_values)
kruskal_data_df = pd.DataFrame.from_dict(kruskal_data, orient='index').reset_index(False).rename(columns={'index': 'Kruskal Results'})
dunns_data_df = pd.DataFrame(dunns_data_dfs)
excel_file_path = "statistical_test_results.xlsx"
with pd.ExcelWriter(excel_file_path) as writer:
kruskal_data_df.to_excel(writer, sheet_name="Kruskal Results", index=False)
dunns_data_df.to_excel(writer, sheet_name="Dunns Results", index=False)