pythonpandasnumpyscipyscikits

In Python, How to run two statistical tests on all numeric columns


I have a dataframe df, I want to do the following:

  1. run two stats tests on all the numeric columns (column_1 to column_84) to compare if there is a statistical difference between Types X, Y and Z
  1. Export the results to excel spreadsheet ( see screenshot below)
# copy & paste

## generate dataframe "df"

import pandas as pd
import numpy as np

df = pd.DataFrame(
    data=np.random.uniform(low=5.5, high=30.75, size=(60, 84)),
    columns=[f'column_{i}' for i in range(1, 85)],)

df.insert(loc=0, column='Type',value=np.repeat(['X','Y','Z'], 20, axis=0),)

df

I want to run kruskal wallis test and Dunn test for each column col_1 to col_84

# copy and paste the libraries below
from scipy.stats import kruskal 
pip install scikit-posthocs 
import scikit_posthocs as sp 

# filtering for each Type X,Y and Z 

# for column_1
 # Extract values for each group
group_x_values = df[df['Type'] == 'X']['column_1'].values
group_y_values = df[df['Type'] == 'Y']['column_1'].values
group_z_values = df[df['Type'] == 'Z']['column_1'].values

# 1st stats test : Kruskal wallis

h_statistic, p_value = kruskal(group_x_values, group_y_values, group_z_values)

# Print the results
print(f"H-statistic: {h_statistic}")
print(f"P-value: {p_value}")


# 2nd stats test: Dunn test
data = [df[df['Group'] == 'X']['column_1'].values,
        df[df['Group'] == 'Y']['column_1'].values,
        df[df['Group'] == 'Z']['column_1'].values]

p_values = sp.posthoc_dunn(data, p_adjust='bonferroni')

print(p_values)

# for column_2

group_x_values = df[df['Type'] == 'X']['column_2'].values
group_y_values = df[df['Type'] == 'Y']['column_2'].values
group_z_values = df[df['Type'] == 'Z']['column_2'].values

# 1st stats test : Kruskal wallis
h_statistic, p_value = kruskal(group_x_values, group_y_values, group_z_values)
# Print the results
print(f"H-statistic: {h_statistic}")
print(f"P-value: {p_value}")

# 2nd stats test: Dunn test
data = [df[df['Group'] == 'X']['column_2'].values,
        df[df['Group'] == 'Y']['column_2'].values,
        df[df['Group'] == 'Z']['column_2'].values]
p_values = sp.posthoc_dunn(data, p_adjust='bonferroni')

print(p_values)
.
.
.
#for column_84
group_x_values = df[df['Type'] == 'X']['column_84'].values
group_y_values = df[df['Type'] == 'Y']['column_84'].values
group_z_values = df[df['Type'] == 'Z']['column_84'].values
# 1st stats test : Kruskal wallis
h_statistic, p_value = kruskal(group_x_values, group_y_values, group_z_values)
# Print the results
print(f"H-statistic: {h_statistic}")
print(f"P-value: {p_value}")

# 2nd stats test: Dunn test
data = [df[df['Group'] == 'X']['column_84'].values,
        df[df['Group'] == 'Y']['column_84'].values,
        df[df['Group'] == 'Z']['column_84'].values]

p_values = sp.posthoc_dunn(data, p_adjust='bonferroni')

print(p_values)

I want to export both results to excel worksheet, something like this:

Kruskal Worksheet

enter image description here

Dunn Worksheet

enter image description here


Solution

  • Few things to adjust here. First, try not to duplicate code. You are essentially running the same process on different columns. Just loop through the columns you want it to process instead of writing the same code multiple times (Are you really going to duplicate that code 84 times for columns 1-84)? Make it dynamic. Second thing, you'll need to use pandas to transform your data. You can then create your final output dataframes to write to excel. Use pandas ExcelWriter to write to workbook.

    from scipy.stats import kruskal 
    #pip install scikit-posthocs 
    import scikit_posthocs as sp 
    
    
    columns_list_num = df.select_dtypes(include='number').columns.tolist()
    
    kruskal_data = {}
    for col in columns_list_num:
        kruskal_data.update({f'{col}': {}})
        
    dunns_data_dfs = []
    
    # filtering for each Type X,Y and Z 
    
    for col in columns_list_num:
    
         # Extract values for each group
        group_x_values = df[df['Type'] == 'X'][f'{col}'].values
        group_y_values = df[df['Type'] == 'Y'][f'{col}'].values
        group_z_values = df[df['Type'] == 'Z'][f'{col}'].values
        
        # 1st stats test : Kruskal wallis
        h_statistic, p_value = kruskal(group_x_values, group_y_values, group_z_values)
        
        kruskal_data[f'{col}'].update({'H-statistic':h_statistic})
        kruskal_data[f'{col}'].update({'P-value':p_value})
    
        # Print the results
        print(f"H-statistic: {h_statistic}")
        print(f"P-value: {p_value}\n")
    
    
    
        # 2nd stats test: Dunn test
        data = [group_x_values,
                group_y_values,
                group_z_values]
        
        p_values = sp.posthoc_dunn(data, p_adjust='bonferroni')
        p_values.index = ['X', 'Y', 'Z']
        p_values.columns = ['X', 'Y', 'Z']
        pairwise_data = {
        'Dunns Results' : f'{col}',
        'X vs Y': p_values.loc['X', 'Y'],
        'Y vs Z': p_values.loc['Y', 'Z'],
        'X vs Z': p_values.loc['X', 'Z']
    }
        
        dunns_data_dfs.append(pairwise_data)
    
        print('\n',p_values)
    
    kruskal_data_df = pd.DataFrame.from_dict(kruskal_data, orient='index').reset_index(False).rename(columns={'index': 'Kruskal Results'})
    dunns_data_df = pd.DataFrame(dunns_data_dfs)
    
    
    excel_file_path = "statistical_test_results.xlsx"
    with pd.ExcelWriter(excel_file_path) as writer:
        kruskal_data_df.to_excel(writer, sheet_name="Kruskal Results", index=False)
        dunns_data_df.to_excel(writer, sheet_name="Dunns Results", index=False)
    

    enter image description here

    enter image description here